from scan_list_comp.Rmd Compile lists of all GluCEST scans.
To DO: - confirm that newer scan dates = Terra - pull scan IDs
7T list from David (up to ~ summer 2019)
glu_old <- read.csv("data/7t_dates.csv", na.strings = "") # read in 7T list from David (up to ~ summer 2019)
summary(glu_old)
## BBLID X7T_date ONM_7T_date ONM_SCANID
## Min. : 12882 Length:134 Length:134 Min. : 8559
## 1st Qu.: 19792 Class :character Class :character 1st Qu.: 8928
## Median : 86344 Mode :character Mode :character Median : 9048
## Mean : 64860 Mean : 9159
## 3rd Qu.: 98273 3rd Qu.: 9334
## Max. :139272 Max. :10091
## NA's :68
## Terra_SCANID
## Length:134
## Class :character
## Mode :character
##
##
##
##
# clean up formatting
glu_old <- glu_old %>%
mutate(BBLID=as.character(BBLID),
X7T_date = gsub(",.*","", X7T_date),
Terra.7T_date = as.Date(X7T_date,format ="%m/%d/%y"),
ONM.7T_date = as.Date(ONM_7T_date,format ="%m/%d/%y"),
ONM.SCANID=as.character(ONM_SCANID)) %>%
rename(Terra.SCANID=Terra_SCANID) %>%
dplyr::select(-X7T_date, -ONM_7T_date, -ONM_SCANID)
# make long
glu_old.long <- glu_old %>% pivot_longer(2:5, names_sep = '[.]', names_to=c("scanner", ".value")) %>%
drop_na("7T_date")
7T list from Arianna’s LongGluCEST study (pulled April 2022)
glu_ar <- read.csv("data/longglucest_scans.csv", header=T) # read in 7T list from Arianna LongGluCEST study (pulled April 2022)
summary(glu_ar)
## BBLID baseline_clinical base_7t follow_7t
## Min. : 20082 Length:35 Length:35 Length:35
## 1st Qu.: 90350 Class :character Class :character Class :character
## Median : 96465 Mode :character Mode :character Mode :character
## Mean : 92511
## 3rd Qu.:116068
## Max. :135085
#clean up formatting
glu_ar <- glu_ar %>%
mutate(BBLID=as.character(BBLID),
base_7t = as.Date(base_7t,format ="%m/%d/%y"),
follow_7t = as.Date(follow_7t,format ="%m/%d/%y")) %>%
dplyr::select(-baseline_clinical)
# make long
glu_ar.long <- gather(glu_ar, scanner, "7T_date", base_7t:follow_7t, factor_key=TRUE) %>%
drop_na("7T_date") %>%
mutate(scanner= fct_collapse(scanner, Terra = c("base_7t", "follow_7t")))
7T list from Heather’s Aging study
glu_age <- read.csv("data/7tglucestage_scans.csv", header=T) # read in 7T list from Heather Aging study (pulled April 2022)
#clean up
glu_age <- glu_age %>%
transmute(BBLID=as.character(BBLID),
"7T_date" = as.Date(DOSCAN,format ="%m/%d/%y"),
scanner = as.factor("Terra"),
SCANID=as.character(SCANID))
summary(glu_age)
## BBLID 7T_date scanner SCANID
## Length:50 Min. :2021-04-15 Terra:50 Length:50
## Class :character 1st Qu.:2021-07-14 Class :character
## Mode :character Median :2021-10-27 Mode :character
## Mean :2021-10-10
## 3rd Qu.:2021-12-12
## Max. :2022-04-01
Join all lists:
#join long versions of old and Longitudinal scan lists
long1 <- merge(x=glu_old.long, y=glu_ar.long, by=c("BBLID", "7T_date", "scanner"),all=TRUE)
str(long1)
## 'data.frame': 182 obs. of 4 variables:
## $ BBLID : chr "100522" "102041" "102041" "105176" ...
## $ 7T_date: Date, format: "2021-12-11" "2018-06-15" ...
## $ scanner: chr "Terra" "Terra" "Terra" "ONM" ...
## $ SCANID : chr NA "10821" NA "9332" ...
#join in Age study scan list
all_7t.long <- merge(x=long1, y=glu_age, by=c("BBLID", "7T_date", "scanner", "SCANID"), all=TRUE)
all_7t.long <- all_7t.long %>% rename(date_7t = "7T_date")
str(all_7t.long)
## 'data.frame': 232 obs. of 4 variables:
## $ BBLID : chr "100522" "100898" "102041" "102041" ...
## $ date_7t: Date, format: "2021-12-11" "2021-11-30" ...
## $ scanner: chr "Terra" "Terra" "Terra" "Terra" ...
## $ SCANID : chr NA "11977" "10821" NA ...
dup <- duplicated(all_7t.long[,1:2])
any(dup) #no duplicates!
## [1] FALSE
Next make a wide version to easily add in clinical/subject-level data.
all_7t.count <- all_7t.long %>%
group_by(BBLID) %>%
arrange(all_7t.long$date_7t) %>%
mutate(count = as.character(row_number(BBLID)),
scanner=as.factor(scanner)) %>%
ungroup() # add counts
all_7t.wide <- pivot_wider(all_7t.count, id_cols=BBLID, names_from = count, values_from=c(date_7t, scanner, SCANID)) # pivot
colnames(all_7t.wide)[2:ncol(all_7t.wide)] <- paste("scan", colnames(all_7t.wide[,c(2:ncol(all_7t.wide))]), sep = "_") #rename cols
# #write final list to csv
# write.csv(all_7t.wide, "data/all_7T_april22.csv", row.names=FALSE, na="")
# save(all_7t.wide, file="data/all_7T_april22.Rdata")
n_distinct(all_7t.wide$BBLID)
## [1] 199
Load diagnosis data from oracle ID and remove 22q subj
#load diagnosis data
redcap <- read.csv("data/oracle_diagnoses_all.csv", header = TRUE, stringsAsFactors = F, na = c("", "null", "..", ".")) %>%
dplyr::select(BBLID, DODIAGNOSIS, HSTATUS, AGEONSET_CLINICRISK | contains("AXIS1_")) %>% #get manageable subset of data
mutate(DODIAGNOSIS = as.Date(DODIAGNOSIS),
BBLID=as.factor(BBLID))
#make list of 22q subj
q22 <- subset(redcap, HSTATUS == "22q",select = BBLID) %>% unique()
#remove 22q participants from scan list, that way it'll apply to all subsequent merges w/ clinical data
all_7t.wide <- all_7t.wide %>% subset(!(BBLID %in% q22$BBLID)) %>% as.data.frame()#drop 22q sub
all_7t.long <- all_7t.long %>% subset(!(BBLID %in% q22$BBLID)) %>% as.data.frame()#drop 22q sub
There are 197 (197) unique subjects who have 7T scans, excluding 22q pts ID’d from Kosha’s RedCap data.
Load SIPS data
#load SIPS
sips <- fread("data/oracle_sips_all.csv", na.strings="null", header=TRUE)
#mutate(DOSIPS = as.Date(DOSIPS),
#bblid = as.factor(bblid))
sips$BBLID <- as.factor(sips$BBLID)
skim(sips)
| Name | sips |
| Number of rows | 4289 |
| Number of columns | 42 |
| Key | NULL |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| Date | 1 |
| factor | 1 |
| numeric | 29 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| PROTOCOL | 7 | 1.00 | 12 | 25 | 0 | 28 | 0 |
| TYPE | 0 | 1.00 | 1 | 10 | 0 | 20 | 0 |
| RATER | 0 | 1.00 | 2 | 8 | 0 | 60 | 0 |
| SIPS_SOURCEID | 4054 | 0.05 | 7 | 18 | 0 | 235 | 0 |
| SOURCE_PROJECT | 4054 | 0.05 | 3 | 10 | 0 | 6 | 0 |
| SPD | 659 | 0.85 | 1 | 1 | 0 | 2 | 0 |
| FHPI | 964 | 0.78 | 1 | 1 | 0 | 2 | 0 |
| FIRST_DEG_SCZ | 1623 | 0.62 | 1 | 1 | 0 | 2 | 0 |
| PRODROMAL | 1254 | 0.71 | 1 | 1 | 0 | 2 | 0 |
| CONSIDER | 1254 | 0.71 | 1 | 1 | 0 | 2 | 0 |
| REVIEW | 1254 | 0.71 | 1 | 1 | 0 | 2 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DOSIPS | 0 | 1 | 1970-01-01 | 2022-04-07 | 2015-01-21 | 1919 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| BBLID | 0 | 1 | FALSE | 1789 | 109: 19, 919: 14, 196: 13, 117: 13 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| VISITNUM | 158 | 0.96 | 1.04 | 0.27 | 1.0 | 1.0 | 1.0 | 1.0 | 4.0 | ▇▁▁▁▁ |
| P1 | 6 | 1.00 | 1.26 | 1.74 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▃▁▁▁ |
| P2 | 9 | 1.00 | 1.09 | 1.60 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| P3 | 8 | 1.00 | 0.62 | 1.30 | 0.0 | 0.0 | 0.0 | 1.0 | 9.0 | ▇▂▁▁▁ |
| P4 | 11 | 1.00 | 1.10 | 1.74 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| P5 | 13 | 1.00 | 0.88 | 1.41 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| N1 | 15 | 1.00 | 1.19 | 1.64 | 0.0 | 0.0 | 1.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| N2 | 14 | 1.00 | 1.02 | 1.57 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| N3 | 19 | 1.00 | 0.88 | 1.57 | 0.0 | 0.0 | 0.0 | 1.0 | 9.0 | ▇▂▁▁▁ |
| N4 | 18 | 1.00 | 0.51 | 1.48 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | ▇▁▁▁▁ |
| N5 | 16 | 1.00 | 1.20 | 1.69 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| N6 | 19 | 1.00 | 1.30 | 1.96 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| D1 | 16 | 1.00 | 0.59 | 1.42 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | ▇▁▁▁▁ |
| D2 | 17 | 1.00 | 0.57 | 1.47 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | ▇▁▁▁▁ |
| D3 | 18 | 1.00 | 1.38 | 1.65 | 0.0 | 0.0 | 1.0 | 2.0 | 9.0 | ▇▅▁▁▁ |
| D4 | 21 | 1.00 | 0.60 | 1.46 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | ▇▁▁▁▁ |
| G1 | 20 | 1.00 | 1.17 | 1.69 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▃▁▁▁ |
| G2 | 20 | 1.00 | 1.07 | 1.77 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| G3 | 23 | 0.99 | 0.39 | 1.36 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | ▇▁▁▁▁ |
| G4 | 22 | 0.99 | 0.98 | 1.76 | 0.0 | 0.0 | 0.0 | 1.0 | 9.0 | ▇▂▁▁▁ |
| GAF_C | 569 | 0.87 | 69.31 | 32.03 | -99.0 | 55.0 | 70.0 | 85.0 | 999.0 | ▇▁▁▁▁ |
| GAF_H | 986 | 0.77 | 71.03 | 32.85 | -99.0 | 58.0 | 70.0 | 86.0 | 999.0 | ▇▁▁▁▁ |
| GAF_PCTCHG | 926 | 0.78 | -0.88 | 5.22 | -74.0 | 0.0 | 0.0 | 0.0 | 95.0 | ▁▁▇▁▁ |
| POS_345 | 1221 | 0.72 | 0.55 | 1.00 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 | ▇▁▁▁▁ |
| POS_6 | 1221 | 0.72 | 0.11 | 0.53 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | ▇▁▁▁▁ |
| NEG_3456 | 1221 | 0.72 | 0.94 | 1.40 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 | ▇▁▁▁▁ |
| DIS_3456 | 1221 | 0.72 | 0.44 | 0.81 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 | ▇▂▁▁▁ |
| GEN_3456 | 1221 | 0.72 | 0.52 | 0.90 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 | ▇▂▁▁▁ |
| POSS_PRO_DX | 3167 | 0.26 | 348.32 | 0.11 | 348.1 | 348.3 | 348.4 | 348.4 | 348.4 | ▂▂▁▂▇ |
Now for our outcome of interest, SIPS/SOPS scores. First filtering by completion.
#list sops items
plist <- c("P1", "P2", "P3", "P4", "P5")
nlist <- c("N1", "N2", "N3", "N4", "N5", "N6")
dlist <- c("D1", "D2", "D3", "D4")
glist <- c("G1", "G2", "G3", "G4")
sops.item.list <- c(plist, nlist, dlist, glist)
#drop missing sops items
sips_filt <- sips %>% drop_na(c(plist))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(plist)` instead of `plist` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
#sum(is.na(sips_filt[,c(sops.item.list)]))
length(unique(sips_filt$BBLID))
## [1] 1787
Some subjects have multiple assessments on one day (one from proband one from family, etc); need to be sure to pick only TYPE == "COMBINED" per discussion with David. If no combined, pick proband.
#if subj has more than one assessment for same day,dplyr::select Combined
sips_filt<- sips_filt %>% mutate(TYPE = as.factor(TYPE))
levels(sips_filt$TYPE)
## [1] "AP" "both" "Both" "collateral" "Collateral"
## [6] "Combined" "FC" "FP" "FPC" "GO2MI_2"
## [11] "GO2YPI_2" "IC" "IP" "IPC" "MI"
## [16] "MP" "P" "proband" "Proband" "YPI"
sips_filt$TYPE <- fct_collapse(sips_filt$TYPE, "Proband" = c("Proband", "proband", "P", "FP", "IP"), "Combined"=c("Combined", "FPC")) #combine all proband
sips_filt$TYPE <- factor(sips_filt$TYPE, levels = c("Combined", "Proband", "Collateral"))
#group data todplyr::select correct type (defined as lowest ordered level) - also cleaning up
sips_filt2 <- sips_filt %>%
group_by(BBLID, DOSIPS) %>%
arrange(TYPE) %>%
slice(c(1)) %>%
ungroup() %>%
dplyr::select(-SIPS_SOURCEID, -SOURCE_PROJECT) %>%
mutate(DOSIPS=as.Date(DOSIPS))
length(unique(sips_filt2$BBLID)) #confirm no subjects deleted
## [1] 1787
skim(sips_filt2)
| Name | sips_filt2 |
| Number of rows | 2884 |
| Number of columns | 40 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| Date | 1 |
| factor | 2 |
| numeric | 29 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| PROTOCOL | 6 | 1.00 | 12 | 25 | 0 | 27 | 0 |
| RATER | 0 | 1.00 | 2 | 8 | 0 | 60 | 0 |
| SPD | 397 | 0.86 | 1 | 1 | 0 | 2 | 0 |
| FHPI | 584 | 0.80 | 1 | 1 | 0 | 2 | 0 |
| FIRST_DEG_SCZ | 948 | 0.67 | 1 | 1 | 0 | 2 | 0 |
| PRODROMAL | 884 | 0.69 | 1 | 1 | 0 | 2 | 0 |
| CONSIDER | 884 | 0.69 | 1 | 1 | 0 | 2 | 0 |
| REVIEW | 884 | 0.69 | 1 | 1 | 0 | 2 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DOSIPS | 0 | 1 | 1970-01-01 | 2022-04-07 | 2015-03-22 | 1917 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| BBLID | 0 | 1.00 | FALSE | 1787 | 109: 8, 145: 7, 164: 7, 177: 7 |
| TYPE | 80 | 0.97 | FALSE | 3 | Pro: 1858, Com: 708, Col: 238 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| VISITNUM | 132 | 0.95 | 1.04 | 0.28 | 1.0 | 1.0 | 1.0 | 1.0 | 4.0 | ▇▁▁▁▁ |
| P1 | 0 | 1.00 | 1.36 | 1.77 | 0.0 | 0.0 | 1.0 | 2.0 | 9.0 | ▇▃▁▁▁ |
| P2 | 0 | 1.00 | 1.20 | 1.61 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| P3 | 0 | 1.00 | 0.64 | 1.27 | 0.0 | 0.0 | 0.0 | 1.0 | 9.0 | ▇▂▁▁▁ |
| P4 | 0 | 1.00 | 1.20 | 1.76 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| P5 | 0 | 1.00 | 0.95 | 1.35 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▃▁▁▁ |
| N1 | 4 | 1.00 | 1.20 | 1.62 | 0.0 | 0.0 | 1.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| N2 | 3 | 1.00 | 1.04 | 1.49 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| N3 | 4 | 1.00 | 0.90 | 1.47 | 0.0 | 0.0 | 0.0 | 1.0 | 9.0 | ▇▂▁▁▁ |
| N4 | 5 | 1.00 | 0.50 | 1.35 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | ▇▁▁▁▁ |
| N5 | 3 | 1.00 | 1.27 | 1.63 | 0.0 | 0.0 | 1.0 | 2.0 | 9.0 | ▇▃▂▁▁ |
| N6 | 5 | 1.00 | 1.33 | 1.94 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| D1 | 2 | 1.00 | 0.58 | 1.27 | 0.0 | 0.0 | 0.0 | 1.0 | 9.0 | ▇▁▁▁▁ |
| D2 | 2 | 1.00 | 0.57 | 1.34 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | ▇▁▁▁▁ |
| D3 | 4 | 1.00 | 1.43 | 1.57 | 0.0 | 0.0 | 1.0 | 3.0 | 9.0 | ▇▆▁▁▁ |
| D4 | 5 | 1.00 | 0.56 | 1.31 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | ▇▁▁▁▁ |
| G1 | 5 | 1.00 | 1.21 | 1.57 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▃▁▁▁ |
| G2 | 5 | 1.00 | 1.10 | 1.67 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| G3 | 7 | 1.00 | 0.36 | 1.17 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | ▇▁▁▁▁ |
| G4 | 7 | 1.00 | 1.05 | 1.70 | 0.0 | 0.0 | 0.0 | 2.0 | 9.0 | ▇▂▁▁▁ |
| GAF_C | 392 | 0.86 | 68.22 | 32.19 | -99.0 | 53.0 | 68.0 | 83.0 | 999.0 | ▇▁▁▁▁ |
| GAF_H | 740 | 0.74 | 70.03 | 33.34 | -99.0 | 57.0 | 70.0 | 85.0 | 999.0 | ▇▁▁▁▁ |
| GAF_PCTCHG | 658 | 0.77 | -0.96 | 5.52 | -72.0 | 0.0 | 0.0 | 0.0 | 95.0 | ▁▁▇▁▁ |
| POS_345 | 863 | 0.70 | 0.65 | 1.08 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 | ▇▁▁▁▁ |
| POS_6 | 863 | 0.70 | 0.15 | 0.61 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | ▇▁▁▁▁ |
| NEG_3456 | 863 | 0.70 | 1.03 | 1.46 | 0.0 | 0.0 | 0.0 | 2.0 | 6.0 | ▇▁▁▁▁ |
| DIS_3456 | 863 | 0.70 | 0.49 | 0.84 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 | ▇▃▁▁▁ |
| GEN_3456 | 863 | 0.70 | 0.59 | 0.93 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 | ▇▂▁▁▁ |
| POSS_PRO_DX | 2100 | 0.27 | 348.32 | 0.11 | 348.1 | 348.3 | 348.4 | 348.4 | 348.4 | ▂▁▁▂▇ |
Calculate SOPS scores by summing within each subscale (seems to be valid metric based on clinicaltrials.gov)
#calculate sops scores
sips_scored<- sips_filt2 %>% mutate(sops_p = rowSums(across(plist), na.rm=F),
sops_n = rowSums(across(nlist), na.rm=F),
sops_d = rowSums(across(dlist), na.rm=F),
sops_g = rowSums(across(glist), na.rm=F),
sops_tot = rowSums(across(c(sops_p, sops_n, sops_d, sops_g), na.rm=F)))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(nlist)` instead of `nlist` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(dlist)` instead of `dlist` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(glist)` instead of `glist` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
#also dropping `3456` cols (and maybe GAF_PCTCHG?) since they don't seem to mean anything or be useful and it would be better to recalculate them anyway
#aksi dropping rater
sips_scored <- sips_scored %>%dplyr::select(-VISITNUM, -RATER, -REVIEW, -POS_345, -POS_6, - NEG_3456, -DIS_3456, - GEN_3456) %>% as.data.frame()
head(sips_scored)
## BBLID PROTOCOL DOSIPS TYPE P1 P2 P3 P4 P5 N1 N2 N3 N4 N5
## 1 11287 804019 - NAYA Assess 2006-01-30 Proband 2 0 2 0 0 0 2 0 0 3
## 2 11636 812481 - 22q 2011-02-09 Proband 6 6 3 6 5 5 3 1 5 1
## 3 11671 804019 - NAYA Assess 2006-08-12 Proband 4 0 1 5 3 1 0 0 2 5
## 4 11671 804019 - NAYA Assess 2008-04-17 Proband 0 3 0 1 0 3 0 1 0 3
## 5 11706 812481 - 22q 2011-04-27 Proband 0 0 0 1 0 0 0 0 0 0
## 6 11936 804019 - NAYA Assess 2008-05-05 Proband 6 6 0 1 0 5 5 0 0 0
## N6 D1 D2 D3 D4 G1 G2 G3 G4 GAF_C GAF_H GAF_PCTCHG SPD FHPI FIRST_DEG_SCZ
## 1 0 0 0 1 0 0 0 0 0 85 85 0 N Y Y
## 2 2 5 2 4 1 3 5 3 4 47 47 0 N N N
## 3 6 2 0 0 2 3 0 0 1 47 45 4 N Y Y
## 4 0 0 0 0 0 0 0 0 0 63 63 0 N Y Y
## 5 0 0 0 0 0 0 0 0 0 83 83 0 N N N
## 6 6 0 2 3 0 0 6 0 3 41 55 -25 N Y N
## PRODROMAL CONSIDER POSS_PRO_DX sops_p sops_n sops_d sops_g sops_tot
## 1 N N NA 4 5 1 0 10
## 2 N Y NA 26 17 12 15 70
## 3 Y N 348.4 13 14 4 4 35
## 4 Y N 348.4 4 7 0 0 11
## 5 N N NA 1 0 0 0 1
## 6 N Y NA 13 16 5 9 43
Merging into long scanlist and filtering for scans with baseline w/in 1 yr and at least one follow-up 1yr or more later
sips_scan_long <- inner_join(all_7t.long, sips_scored, by="BBLID")
head(sips_scan_long)
## BBLID date_7t scanner SCANID PROTOCOL DOSIPS TYPE
## 1 100522 2021-12-11 Terra <NA> 833922 - EvolPsy 2021-03-30 Proband
## 2 100898 2021-11-30 Terra 11977 833922 - EvolPsy 2021-10-08 Proband
## 3 102041 2018-06-15 Terra 10821 810336 - Go3 2015-12-11 Proband
## 4 102041 2022-01-25 Terra <NA> 810336 - Go3 2015-12-11 Proband
## 5 102041 2022-03-24 Terra 12139 810336 - Go3 2015-12-11 Proband
## 6 105176 2015-02-26 ONM 9332 810336 - Go2 Supplement 2013-06-22 Proband
## P1 P2 P3 P4 P5 N1 N2 N3 N4 N5 N6 D1 D2 D3 D4 G1 G2 G3 G4 GAF_C GAF_H
## 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 NA NA
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NA NA
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 94 94
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 94 94
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 94 94
## 6 0 0 0 2 0 1 2 0 0 2 2 0 0 1 0 2 1 0 0 75 75
## GAF_PCTCHG SPD FHPI FIRST_DEG_SCZ PRODROMAL CONSIDER POSS_PRO_DX sops_p
## 1 NA N N N <NA> <NA> NA 0
## 2 NA N N N <NA> <NA> NA 0
## 3 0 N N N N N NA 0
## 4 0 N N N N N NA 0
## 5 0 N N N N N NA 0
## 6 0 N N <NA> N N NA 2
## sops_n sops_d sops_g sops_tot
## 1 0 1 0 1
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 7 1 3 13
#calc diff from each 7T scan (negative values -> dx before 7t)
sips_scan_long <- sips_scan_long %>%
mutate(sips_diff_days = as.numeric(difftime(DOSIPS,date_7t, units = "days")),
sips_diff_yrs = round((as.numeric(difftime(DOSIPS,date_7t, units = "weeks"))/52.25)/.5)*.5) %>% #time diff rounded to half-yr
rename(date7t=date_7t)
sip.pre.scan_l <- which(sips_scan_long$sips_diff_days <= 364 & sips_scan_long$sips_diff_days >= -365)
sip.post.scan_l <- which(sips_scan_long$sips_diff_days > 364)
sips_scan_long.lab <- sips_scan_long %>%
mutate(scan.time = case_when(row_number() %in% sip.pre.scan_l ~ "base",
row_number() %in% sip.post.scan_l ~ "end"),) %>% #label every sops timepoint relative to each scan
filter(!is.na(scan.time)) %>% #drop sops that don't meet criteria for base or endpoint
group_by(BBLID, date7t) %>%
arrange(abs(sips_diff_days)) %>%
mutate(row = row_number()) %>%
filter(!(scan.time=="base" & row > 1)) %>% #if subject has more than one baseline SOPS, just get closest to scan
ungroup() %>%
dplyr::select(-row)
table(as.factor(sips_scan_long.lab$scan.time))
##
## base end
## 120 58
n_unique(sips_scan_long.lab$BBLID)
## [1] 110
#drop scans with less than 2 timepoints
scanlist_clean <- sips_scan_long.lab %>%
group_by(BBLID, date7t) %>%
arrange(abs(sips_diff_days)) %>%
filter(first(scan.time)=="base" & n() >= 2) %>% #only keep BBLID-scan pairs with 2+ timepoints, the first being baseline
ungroup() %>%
group_by(BBLID, date7t, scan.time) %>%
mutate(row = row_number(),
scan.time=paste(scan.time,as.character(row), sep="."), #number endpoints if subj has more than one
observation = paste(as.character(BBLID), as.character(date7t), scan.time)) %>% #add indicator for observation
ungroup() %>%
dplyr::select(-row)
table(as.factor(scanlist_clean$scan.time))
##
## base.1 end.1 end.2 end.3 end.4 end.5
## 32 32 8 4 3 1
n_unique(scanlist_clean$BBLID)
## [1] 31
skim(scanlist_clean)
| Name | scanlist_clean |
| Number of rows | 80 |
| Number of columns | 44 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| Date | 2 |
| factor | 1 |
| numeric | 30 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| BBLID | 0 | 1.00 | 5 | 6 | 0 | 31 | 0 |
| scanner | 0 | 1.00 | 3 | 5 | 0 | 2 | 0 |
| SCANID | 0 | 1.00 | 4 | 5 | 0 | 32 | 0 |
| PROTOCOL | 0 | 1.00 | 12 | 25 | 0 | 12 | 0 |
| SPD | 10 | 0.88 | 1 | 1 | 0 | 1 | 0 |
| FHPI | 18 | 0.78 | 1 | 1 | 0 | 2 | 0 |
| FIRST_DEG_SCZ | 22 | 0.72 | 1 | 1 | 0 | 2 | 0 |
| PRODROMAL | 35 | 0.56 | 1 | 1 | 0 | 2 | 0 |
| CONSIDER | 35 | 0.56 | 1 | 1 | 0 | 2 | 0 |
| scan.time | 0 | 1.00 | 5 | 6 | 0 | 6 | 0 |
| observation | 0 | 1.00 | 22 | 24 | 0 | 80 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date7t | 0 | 1 | 2013-10-11 | 2018-12-18 | 2014-11-26 | 30 |
| DOSIPS | 0 | 1 | 2013-01-10 | 2022-02-18 | 2018-02-18 | 78 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| TYPE | 0 | 1 | FALSE | 3 | Pro: 69, Com: 10, Col: 1 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| P1 | 0 | 1.00 | 1.94 | 1.89 | 0.0 | 0.00 | 2.0 | 3.00 | 6.0 | ▇▅▁▁▂ |
| P2 | 0 | 1.00 | 1.35 | 1.54 | 0.0 | 0.00 | 1.0 | 2.00 | 6.0 | ▇▂▂▁▁ |
| P3 | 0 | 1.00 | 0.96 | 1.52 | 0.0 | 0.00 | 0.0 | 2.00 | 6.0 | ▇▁▁▁▁ |
| P4 | 0 | 1.00 | 1.50 | 1.86 | 0.0 | 0.00 | 0.0 | 3.00 | 6.0 | ▇▁▂▁▁ |
| P5 | 0 | 1.00 | 1.34 | 1.53 | 0.0 | 0.00 | 1.0 | 2.00 | 6.0 | ▇▂▂▁▁ |
| N1 | 0 | 1.00 | 1.04 | 1.26 | 0.0 | 0.00 | 1.0 | 2.00 | 5.0 | ▇▃▁▁▁ |
| N2 | 0 | 1.00 | 0.60 | 1.06 | 0.0 | 0.00 | 0.0 | 1.00 | 4.0 | ▇▂▁▁▁ |
| N3 | 0 | 1.00 | 0.80 | 1.13 | 0.0 | 0.00 | 0.0 | 1.00 | 4.0 | ▇▃▂▂▁ |
| N4 | 0 | 1.00 | 0.62 | 1.29 | 0.0 | 0.00 | 0.0 | 0.00 | 5.0 | ▇▁▁▁▁ |
| N5 | 0 | 1.00 | 0.74 | 1.13 | 0.0 | 0.00 | 0.0 | 1.00 | 4.0 | ▇▂▂▁▁ |
| N6 | 0 | 1.00 | 1.26 | 2.14 | 0.0 | 0.00 | 0.0 | 2.00 | 6.0 | ▇▁▁▁▂ |
| D1 | 0 | 1.00 | 0.50 | 1.01 | 0.0 | 0.00 | 0.0 | 1.00 | 5.0 | ▇▁▁▁▁ |
| D2 | 0 | 1.00 | 0.75 | 1.49 | 0.0 | 0.00 | 0.0 | 1.00 | 6.0 | ▇▁▁▁▁ |
| D3 | 1 | 0.99 | 0.94 | 1.26 | 0.0 | 0.00 | 0.0 | 2.00 | 4.0 | ▇▂▂▂▁ |
| D4 | 0 | 1.00 | 0.34 | 0.79 | 0.0 | 0.00 | 0.0 | 0.00 | 4.0 | ▇▁▁▁▁ |
| G1 | 0 | 1.00 | 0.89 | 1.47 | 0.0 | 0.00 | 0.0 | 2.00 | 6.0 | ▇▁▂▁▁ |
| G2 | 0 | 1.00 | 0.91 | 1.41 | 0.0 | 0.00 | 0.0 | 2.00 | 6.0 | ▇▂▁▁▁ |
| G3 | 0 | 1.00 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 0.00 | 0.0 | ▁▁▇▁▁ |
| G4 | 0 | 1.00 | 0.75 | 1.19 | 0.0 | 0.00 | 0.0 | 1.00 | 5.0 | ▇▁▁▁▁ |
| GAF_C | 20 | 0.75 | 65.17 | 18.45 | 21.0 | 50.00 | 67.5 | 80.25 | 95.0 | ▁▇▆▇▇ |
| GAF_H | 26 | 0.68 | 68.74 | 17.07 | 21.0 | 57.00 | 70.0 | 81.00 | 95.0 | ▁▅▆▇▇ |
| GAF_PCTCHG | 25 | 0.69 | -0.67 | 3.71 | -25.0 | 0.00 | 0.0 | 0.00 | 0.0 | ▁▁▁▁▇ |
| POSS_PRO_DX | 58 | 0.28 | 348.35 | 0.11 | 348.1 | 348.40 | 348.4 | 348.40 | 348.4 | ▂▁▁▁▇ |
| sops_p | 0 | 1.00 | 7.09 | 6.62 | 0.0 | 2.00 | 5.5 | 10.25 | 29.0 | ▇▅▂▁▁ |
| sops_n | 0 | 1.00 | 5.06 | 4.57 | 0.0 | 1.00 | 5.0 | 8.00 | 21.0 | ▇▆▂▁▁ |
| sops_d | 1 | 0.99 | 2.53 | 3.08 | 0.0 | 0.00 | 1.0 | 4.00 | 13.0 | ▇▃▁▁▁ |
| sops_g | 0 | 1.00 | 2.55 | 3.15 | 0.0 | 0.00 | 1.0 | 4.00 | 12.0 | ▇▂▂▁▁ |
| sops_tot | 1 | 0.99 | 17.29 | 14.90 | 0.0 | 5.50 | 14.0 | 26.50 | 74.0 | ▇▅▂▁▁ |
| sips_diff_days | 0 | 1.00 | 723.59 | 821.16 | -274.0 | -37.75 | 537.0 | 1273.75 | 2799.0 | ▇▆▃▂▂ |
| sips_diff_yrs | 0 | 1.00 | 1.99 | 2.24 | -0.5 | 0.00 | 1.5 | 3.50 | 7.5 | ▇▅▃▁▂ |
Keeping data long for now for mixed random effects model!
Adding sex, DOB/age, race (csv adapted from 7T list from David).
#load demographics (adapted from 7T list from David)
demo.phi <- read.csv("data/7t_demographics.csv", header = TRUE, stringsAsFactors = TRUE, na = c("", "..", ".")) %>%
mutate(sex = as.factor(Sex_1M),
Ethnicity = as.factor(Ethnicity),
DOB = as.Date(DOB, format = "%m/%d/%y"),
BBLID = as.factor(BBLID)) %>%
dplyr::select(-Diagnostic_Group, -sex, -Age)
sops_demo.phi <- left_join(scanlist_clean, demo.phi, by="BBLID")
#skim(sops_demo.phi)
n_unique(sops_demo.phi$observation)
## [1] 80
Demographic variables have 100% completion.
Using DOB to calculate age at scan and at endpoint SOPS, then removing.
sops_demo <- sops_demo.phi %>%
mutate(age_scan = as.numeric(difftime(date7t,DOB, units = "weeks"))/52.25,
age_sops = as.numeric(difftime(DOSIPS,DOB, units = "weeks"))/52.25) %>%
dplyr::select(-DOB)
summary(sops_demo$age_scan)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.70 16.79 19.49 19.65 21.40 26.02
summary(sops_demo$age_sops)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.44 19.11 21.46 21.63 23.67 29.42
Now merge in diagnosis and filter to smallest datediff
#get just dates/matching info for observations
just_scans <- sops_demo %>%dplyr::select(BBLID, observation, DOSIPS, date7t, scan.time)
sops_dx <- left_join(just_scans, redcap, by="BBLID")
any(is.na(sops_dx$AXIS1_DX1)) #no primary dx missing
## [1] FALSE
#calculate datediff from dx to sops, keep just row closest
sops_dx.filt <- sops_dx %>%
mutate(dx_sops_diff = as.numeric(difftime(DODIAGNOSIS,DOSIPS, units = "days"))) %>%
arrange(abs(dx_sops_diff)) %>%
group_by(observation) %>%
slice_head() %>% #keep only closest dx
ungroup() %>%
filter(abs(dx_sops_diff) <= 100) %>% #restrict to w/in 100 days
dplyr::select_if(~!all(is.na(.)))
sops_dx.filt %>%
ggplot(aes(x=dx_sops_diff, fill=scan.time)) + geom_bar() + facet_grid(cols = vars(scan.time))
#focusing on baseline
sops_dx.filt %>% filter(scan.time=="base.1") %>%
mutate(dx_to_scan_diff = as.numeric(difftime(DODIAGNOSIS,date7t, units = "days"))) %>%
ggplot(aes(x=dx_to_scan_diff)) + geom_dotplot()
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
#merge back in
n_unique(sops_dx.filt$observation)
## [1] 75
summary(sops_dx.filt$dx_sops_diff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7.00000 0.00000 0.00000 0.06667 0.00000 8.00000
#skim(sops_change_dx)
Now merge back in
#drop stuff that's already there before merging
sops_dx.filt <- sops_dx.filt %>%
dplyr::select(-BBLID, -DOSIPS, -date7t, -scan.time)
sops_dx.final <- left_join(sops_demo, sops_dx.filt, by="observation")
#skim(sops_dx.final)
n_unique(sops_dx.final$observation)
## [1] 80
First I’ll make an indicator for psychosis/prodrome at baseline or endpoint (since HSTATUS cols aren’t really great).
sops_dx.final %>%dplyr::select(contains("AXIS1_DX")) %>%
t %>% c %>% unique
## [1] "348.4" NA "348.43" "799.90B" "348.62" "300.23" "304.3"
## [8] "298.9" "295.6" "311" "V71.09B" "296.26" "305" "314"
## [15] "348.12" "296.25" "296.2" "296.9" "348.41" "309.21" "295.4"
## [22] "V62.82" "309.81" "300.02" "295.9" "314.01" "303.9" "348.11"
## [29] "296.32" "300" "296.35" "300.3" "348.42"
control.list <- c("V71.09B")
pro.list<- c("348.4", "348.43", "348.62", "348.42", "348.12", "348.41", "348.11")
psych.list <- c("295.6", "298.9", "295.4", "295.9")
other.list <- c("300", "300.23", "311", "309.21", "304.3", "296.35", "V62.82", "309.81", "300.02", "305", "296.36", "303.9", "296.32", "314", "296.25", "296.2", "296.26", "300.3", "296.9", "314.01", "799.90B")
#making indicator for each dx category
sops_dx.sum <- sops_dx.final %>%
mutate(dx_control = if_any(starts_with("AXIS1_DX"), ~ .x %in% control.list,1,0),
dx_pro = if_any(starts_with("AXIS1_DX"), ~ .x %in% pro.list,1,0),
dx_psych = if_any(starts_with("AXIS1_DX"), ~ .x %in% psych.list,1,0),
dx_other = if_any(starts_with("AXIS1_DX"), ~ .x %in% other.list,1,0)) %>%
mutate(diagnosis = as.factor(case_when(dx_pro == TRUE ~ "pro", #single col of dx
dx_psych == TRUE ~ "psych",
dx_other == TRUE ~ "other",
dx_control == TRUE ~ "none")))
#adding a base_dx indicator, since if we're predicting outcomes we'd know this (though maybe v correlated with intercept)
sops_dx.sum <- sops_dx.sum %>%
group_by(BBLID, SCANID) %>%
mutate(base_dx=first(diagnosis),
base_gafc=first(GAF_C),
base_sops_p=first(sops_p),
delta_sops=sops_p-base_sops_p,
sops_cat = as.factor(case_when(delta_sops<0 ~ "remit",
delta_sops==0 ~ "stable",
delta_sops>0 ~ "progress")))%>%
ungroup()
sops_dx.sum %>%
ggplot(aes(y=BBLID, x=scan.time, color=diagnosis)) + geom_point()
Diagnosis amongst baseline prodromal and psychotic disorder subjects seem to be largely stable over time.
And drop all the extra dx columns we don’t need
sops_dx.sum.filt <- sops_dx.sum %>%
dplyr::select(!contains("AXIS1") & !contains("HSTATUS") & !contains("CONSIDER") & !contains("PRODROMAL") & !contains("POSS_PRO_DX"))
Load and clean med data
#load med data
meds <- read.csv("data/oracle_meds_all.csv", header = TRUE, stringsAsFactors = F, na = c("", "null"))
skim(meds)
| Name | meds |
| Number of rows | 17469 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 15 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| COLLECTID | 2 | 1.00 | 1 | 41 | 0 | 5829 | 0 |
| BBLID | 23 | 1.00 | 2 | 29 | 0 | 3818 | 0 |
| PROTOCOL | 11301 | 0.35 | 12 | 79 | 0 | 48 | 0 |
| DOCOLLECT | 31 | 1.00 | 10 | 21 | 0 | 3180 | 0 |
| COLLECTBY | 2202 | 0.87 | 1 | 8 | 0 | 184 | 0 |
| MED_COLLECT | 153 | 0.99 | 1 | 1 | 0 | 2 | 0 |
| ENTBY | 112 | 0.99 | 1 | 8 | 0 | 100 | 0 |
| DOENT | 12 | 1.00 | 10 | 10 | 0 | 2809 | 0 |
| NOTES | 7225 | 0.59 | 1 | 138 | 0 | 128 | 15 |
| MEDICINE | 2576 | 0.85 | 2 | 114 | 0 | 1278 | 0 |
| PHARMCLASS | 4245 | 0.76 | 6 | 1030 | 0 | 276 | 32 |
| DOMED_START | 3360 | 0.81 | 10 | 21 | 0 | 4470 | 0 |
| DOMED_END | 6620 | 0.62 | 10 | 21 | 0 | 4435 | 0 |
| IS_CURRENT | 2649 | 0.85 | 1 | 1 | 0 | 3 | 0 |
| COMMENTS | 15786 | 0.10 | 1 | 252 | 0 | 1087 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| VISITNUM | 13651 | 0.22 | 1.63 | 17.49 | 0 | 1.00 | 1 | 1.00 | 900 | ▇▁▁▁▁ |
| DOSAGE | 4553 | 0.74 | 494.13 | 1833.23 | 0 | 4.00 | 20 | 150.00 | 24000 | ▇▁▁▁▁ |
| ROUTE | 4758 | 0.73 | 1.05 | 0.37 | 0 | 1.00 | 1 | 1.00 | 9 | ▇▁▁▁▁ |
| DURATION | 6628 | 0.62 | 14.07 | 35.23 | -9 | 0.00 | 2 | 12.00 | 481 | ▇▁▁▁▁ |
| MEDICINEID | 2581 | 0.85 | 7780.22 | 4662.10 | 1 | 3740.75 | 7476 | 11900.25 | 15986 | ▇▇▇▇▇ |
#each med is it's own row, so need to rotate wide
meds <- meds %>%
group_by(BBLID, DOCOLLECT) %>%
mutate(ID = paste(as.character(BBLID), as.character(DOCOLLECT)), #col to ID subj+timepoint
count = row_number(), #count meds entered
DOCOLLECT= as.Date(DOCOLLECT)) %>%
ungroup()
hist(meds$count)
medswide <- meds %>%
dplyr::select(BBLID, DOCOLLECT, ID, count, MEDICINE, NOTES, IS_CURRENT) %>%
mutate(BBLID=as.factor(BBLID)) %>%
pivot_wider(., names_from = count, values_from=c(MEDICINE, NOTES, IS_CURRENT)) # pivot
#get subset
meds_reduced <- meds %>%
dplyr::select(BBLID, DOCOLLECT, MED_COLLECT, NOTES, MEDICINE, PHARMCLASS, DOMED_START, DOMED_END, IS_CURRENT)
Trying to add PNC data to see if that will help get more baseline info
pnc.meds <- read.csv("data/n1601_health_with_meds_20170421.csv", header = TRUE, stringsAsFactors = F, na ="") %>%
mutate(BBLID=as.factor(bblid)) %>%
dplyr::select(-bblid)
skim(pnc.meds)
| Name | pnc.meds |
| Number of rows | 1601 |
| Number of columns | 30 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| factor | 1 |
| numeric | 18 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| med1 | 726 | 0.55 | 3 | 27 | 0 | 233 | 0 |
| med2 | 1081 | 0.32 | 3 | 27 | 0 | 178 | 0 |
| med3 | 1295 | 0.19 | 3 | 29 | 0 | 127 | 0 |
| med4 | 1424 | 0.11 | 3 | 29 | 0 | 92 | 0 |
| med5 | 1503 | 0.06 | 4 | 26 | 0 | 62 | 0 |
| med6 | 1542 | 0.04 | 6 | 26 | 0 | 46 | 0 |
| med7 | 1564 | 0.02 | 3 | 35 | 0 | 26 | 0 |
| med8 | 1586 | 0.01 | 3 | 13 | 0 | 14 | 0 |
| med9 | 1591 | 0.01 | 6 | 13 | 0 | 9 | 0 |
| med10 | 1595 | 0.00 | 4 | 15 | 0 | 6 | 0 |
| med11 | 1597 | 0.00 | 6 | 10 | 0 | 4 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| BBLID | 0 | 1 | FALSE | 1601 | 800: 1, 801: 1, 801: 1, 802: 1 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| scanid | 0 | 1 | 5182.26 | 1436.06 | 2632 | 4004 | 5148 | 6215 | 8470 | ▆▇▇▆▃ |
| smrytrt_psychinpt | 0 | 1 | 0.04 | 0.19 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| psychoactiveMedPsychv2 | 0 | 1 | 0.11 | 0.32 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| psychoactiveMedMedicalv2 | 0 | 1 | 0.04 | 0.20 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| incidentalFindingExclude | 0 | 1 | 0.01 | 0.11 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| medicalratingExcludev1 | 0 | 1 | 0.05 | 0.22 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| healthExcludev2 | 0 | 1 | 0.10 | 0.29 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| ltnExcludev2 | 0 | 1 | 0.21 | 0.41 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▂ |
| squeakycleanExclude | 0 | 1 | 0.74 | 0.44 | 0 | 0 | 1 | 1 | 1 | ▃▁▁▁▇ |
| psychoactiveMedMedical | 0 | 1 | 0.04 | 0.20 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| medclass_Antipsychotic | 0 | 1 | 0.02 | 0.13 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| medclass_Anticonvulsant | 0 | 1 | 0.01 | 0.11 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| medclass_Antidepressant | 0 | 1 | 0.03 | 0.17 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| medclass_Benzodiazepine | 0 | 1 | 0.00 | 0.07 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| medclass_Stimulant | 0 | 1 | 0.07 | 0.26 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| medclass_NonstimulantADHDmed | 0 | 1 | 0.02 | 0.13 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| medclass_Other | 0 | 1 | 0.00 | 0.04 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| medclass_Lithium | 0 | 1 | 0.00 | 0.00 | 0 | 0 | 0 | 0 | 0 | ▁▁▇▁▁ |
#get managable subset of scan list
scan_dates <- sops_dx.sum.filt %>%
dplyr::select(BBLID, date7t, observation) %>%
distinct()
pnc.meds_scan_date <- inner_join(scan_dates, pnc.meds, by="BBLID") %>%
distinct(observation, .keep_all = T)
n_unique(pnc.meds_scan_date$BBLID)
## [1] 27
Inferring based on PNC data collection window that all meds are within study range, will add into lifetime calculations.
Lets go back and look at lifetime:
#merge with scan lists
med_dates <- inner_join(just_scans, meds_reduced, by="BBLID")
n_unique(med_dates$observation)
## [1] 80
med_lifetime <- med_dates %>%
group_by(observation) %>%
mutate(meds_sops_diff=as.numeric(difftime(DOCOLLECT, DOSIPS, units = "days"))) %>% #DOCOLLECT - DOSIPS, so if + then DOCOLLECT is after SOPS
filter((is.na(DOMED_START) & meds_sops_diff <= 100) | DOSIPS >= DOMED_START) %>% #keep if med is STARTED BEFORE SOPS (or start date unknown but collect is no later than 100 days after)
mutate(count = row_number()) %>% #add counts so i can pivot after
ungroup()
med_lifetime$MEDICINE[is.na(med_lifetime$MEDICINE)] <- "empty entry"
skim(med_lifetime)
| Name | med_lifetime |
| Number of rows | 383 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| Date | 3 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| BBLID | 0 | 1.00 | 5 | 6 | 0 | 31 | 0 |
| observation | 0 | 1.00 | 22 | 24 | 0 | 77 | 0 |
| scan.time | 0 | 1.00 | 5 | 6 | 0 | 6 | 0 |
| MED_COLLECT | 8 | 0.98 | 1 | 1 | 0 | 2 | 0 |
| NOTES | 365 | 0.05 | 1 | 138 | 0 | 7 | 3 |
| MEDICINE | 0 | 1.00 | 4 | 37 | 0 | 74 | 0 |
| PHARMCLASS | 249 | 0.35 | 14 | 332 | 0 | 29 | 0 |
| DOMED_START | 227 | 0.41 | 10 | 10 | 0 | 47 | 0 |
| DOMED_END | 333 | 0.13 | 10 | 10 | 0 | 20 | 0 |
| IS_CURRENT | 164 | 0.57 | 1 | 1 | 0 | 3 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DOSIPS | 0 | 1 | 2014-01-08 | 2022-02-18 | 2019-01-29 | 75 |
| date7t | 0 | 1 | 2013-10-11 | 2018-12-18 | 2015-08-26 | 30 |
| DOCOLLECT | 0 | 1 | 2013-11-14 | 2022-03-08 | 2018-03-27 | 133 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| meds_sops_diff | 0 | 1 | -492.30 | 806.47 | -2600 | -1016 | -364 | 0 | 2198 | ▂▃▇▁▁ |
| count | 0 | 1 | 6.06 | 6.38 | 1 | 2 | 4 | 8 | 31 | ▇▂▁▁▁ |
#trying to find distinct meds for each observation
med_lifetime_dist <- med_lifetime %>%
group_by(observation) %>%
distinct(MEDICINE, .keep_all = T) %>% #keep distinct
mutate(count=row_number()) %>% #update counts so i can pivot after
ungroup() %>%
dplyr::select(BBLID, DOSIPS, observation, MEDICINE, NOTES, count)
table(med_lifetime_dist$MEDICINE)
##
## LEVONORGESTREL AND ETHINYL ESTRADIOL ABILIFY
## 1 6
## ADDERALL ADDERALL XR
## 4 1
## ADVAIR ALBUTERAL
## 4 2
## ALBUTEROL ALLERGY MEDS OTC
## 7 3
## AVIANE BENADRYL
## 2 2
## BIRTH CONTROL BIRTH CONTROL
## 2 6
## BUSPIRONE BUTEROL
## 1 2
## CALCIUM SUPPLEMENT CARVEDILOL
## 2 1
## CIMBACORT CLONIDINE
## 2 2
## COGENTIN CONCERTA
## 2 2
## DDAVP DEXEDRINE
## 4 2
## DEXMETHYLPHENIDATE HYDROCHLORIDE DICLAFONEC
## 1 2
## DICLOFENAC empty entry
## 2 63
## FERROUS SULFATE FLONASE
## 2 8
## FLOVENT IBUPROFEN
## 7 2
## IRON ISENTRESS
## 2 2
## LISINOPRIL LISINOPRIL AND HYDROCHLOROTHIAZIDE
## 2 1
## LOESTRIN 1.5/30 LORATADINE
## 4 2
## LORATADINE ALLERGY RELIEF LORATIDINE
## 2 2
## MACA ROOT MANE CHOICE
## 2 1
## MELATONIN MICROGESTIN
## 1 4
## MOTRIN MULTI VITAMIN AND MINERAL
## 5 2
## MULTIVITAMIN MULTIVITAMIN WITH FLUORIDE
## 2 2
## MUSCLE RELAXERS NEXAPLANON
## 2 2
## NONE NORETHINDRONE ACETATE
## 3 4
## ORTHO TRI CYCLEN LO OXYCODONE
## 2 2
## OXYCODONE HYDROCHLORIDE AND ASPIRIN PROMETHAZINE VC
## 1 1
## PROZAC RANITIDINE
## 4 2
## RATIDINE RISPERIDONE
## 2 2
## RISPERIDONE ORAL DISINTEGRATE TABLETS SEROQUEL
## 2 2
## SPRINTEC SRONYX
## 3 4
## SYMBICORT TENEX
## 2 2
## TRILOMARZIA TRUVADA
## 2 2
## TYLENOL VITAMIN D
## 5 1
## VYVANSE WELLBUTRIN
## 2 2
## XANAX ZOLOFT
## 2 2
## ZYRTEC ZYRTEC-D
## 6 3
#pivot
med_lifetime_dist.wide <- pivot_wider(med_lifetime_dist, id_cols=observation, names_from = count, values_from=c(MEDICINE, NOTES)) %>% # pivot
dplyr::select_if(~!all(is.na(.))) #drop any col that's totally empty
n_unique(med_lifetime_dist.wide$observation)
## [1] 77
n_unique(med_lifetime_dist$BBLID)
## [1] 31
If an individual only has empty entry value then we can infer that they are not taking any medications. Restricting to either date of collection no later than 100 days post-SOPS OR medicaiton being noted as starting before the SOPS date, there’s med info for all subjects but not for all observations.
Integrate PNC meds:
pnc.meds.only <- pnc.meds_scan_date %>%
dplyr::select(observation, 22:32)
pnc.meds.only$med1[is.na(pnc.meds.only$med1)] <- "empty entry"
med_lifetime_comp <- full_join(med_lifetime_dist.wide, pnc.meds.only, by="observation")
#check if any observations are still missing med info:
med_lifetime_comp %>% filter(is.na(MEDICINE_1) & is.na(med1)) %>% nrow
## [1] 0
Yay, no observations are unaccounted for. Save out and manually code for psych rx:
write.csv(med_lifetime_comp, "data/med_lifetime_comp.csv", row.names=FALSE, na="")
Read back in and merge coded lifetime meds:
meds.coded <- read.csv("data/med_lifetime_comp.CODED.csv", header = TRUE, stringsAsFactors = F, na="") %>%
dplyr::select(observation, psych.rx_life) %>%
mutate(psych.rx_life=as.factor(psych.rx_life))
sops_change_meds.life <- left_join(sops_dx.sum.filt, meds.coded, by="observation")
sops_change_meds.life <- sops_change_meds.life %>%
group_by(BBLID, SCANID) %>%
mutate(base_rx=first(psych.rx_life)) %>%
ungroup()
n_unique(sops_change_meds.life$observation)
## [1] 80
#skim(sops_change_meds.life)
Read in processed CEST values from Harvard Oxford Subcortical Atlas ROIs:
cest.rois <- read.csv("data/GluCEST-HarvardOxford-Subcortical-Measures.csv", header = TRUE, stringsAsFactors = F, sep = '\t', na="NaN")
skim(cest.rois)
| Name | cest.rois |
| Number of rows | 26 |
| Number of columns | 64 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| logical | 27 |
| numeric | 36 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Subject | 0 | 1 | 86 | 90 | 0 | 26 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| L_Caudate_mean | 26 | 0 | NaN | : |
| L_Caudate_numvoxels | 26 | 0 | NaN | : |
| L_Caudate_SD | 26 | 0 | NaN | : |
| L_Putamen_mean | 26 | 0 | NaN | : |
| L_Putamen_numvoxels | 26 | 0 | NaN | : |
| L_Putamen_SD | 26 | 0 | NaN | : |
| L_Pallidum_mean | 26 | 0 | NaN | : |
| L_Pallidum_numvoxels | 26 | 0 | NaN | : |
| L_Pallidum_SD | 26 | 0 | NaN | : |
| L_Hipp_mean | 26 | 0 | NaN | : |
| L_Hipp_numvoxels | 26 | 0 | NaN | : |
| L_Hipp_SD | 26 | 0 | NaN | : |
| L_Amygdala_mean | 26 | 0 | NaN | : |
| L_Amygdala_numvoxels | 26 | 0 | NaN | : |
| L_Amygdala_SD | 26 | 0 | NaN | : |
| L_Accumbens_mean | 26 | 0 | NaN | : |
| L_Accumbens_numvoxels | 26 | 0 | NaN | : |
| L_Accumbens_SD | 26 | 0 | NaN | : |
| R_Putamen_mean | 26 | 0 | NaN | : |
| R_Putamen_numvoxels | 26 | 0 | NaN | : |
| R_Putamen_SD | 26 | 0 | NaN | : |
| R_Pallidum_mean | 26 | 0 | NaN | : |
| R_Pallidum_numvoxels | 26 | 0 | NaN | : |
| R_Pallidum_SD | 26 | 0 | NaN | : |
| R_Amygdala_mean | 26 | 0 | NaN | : |
| R_Amygdala_numvoxels | 26 | 0 | NaN | : |
| R_Amygdala_SD | 26 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| L_CerebralWM_mean | 25 | 0.04 | 2.080000e+00 | NA | 2.08 | 2.08 | 2.08 | 2.08 | 2.080e+00 | ▁▁▇▁▁ |
| L_CerebralWM_numvoxels | 25 | 0.04 | 3.450000e+02 | NA | 345.00 | 345.00 | 345.00 | 345.00 | 3.450e+02 | ▁▁▇▁▁ |
| L_CerebralWM_SD | 25 | 0.04 | 5.480000e+00 | NA | 5.48 | 5.48 | 5.48 | 5.48 | 5.480e+00 | ▁▁▇▁▁ |
| L_CerebralCortex_mean | 17 | 0.35 | 7.710000e+00 | 3.380000e+00 | 1.93 | 6.52 | 6.90 | 8.19 | 1.280e+01 | ▂▂▇▁▃ |
| L_CerebralCortex_numvoxels | 17 | 0.35 | 4.086700e+02 | 2.975700e+02 | 5.00 | 201.00 | 327.00 | 603.00 | 8.500e+02 | ▅▇▂▂▅ |
| L_CerebralCortex_SD | 17 | 0.35 | 2.330000e+00 | 1.010000e+00 | 0.90 | 1.69 | 2.09 | 3.17 | 3.870e+00 | ▇▇▇▃▇ |
| L_LatVent_mean | 25 | 0.04 | 3.890000e+00 | NA | 3.89 | 3.89 | 3.89 | 3.89 | 3.890e+00 | ▁▁▇▁▁ |
| L_LatVent_numvoxels | 25 | 0.04 | 4.100000e+01 | NA | 41.00 | 41.00 | 41.00 | 41.00 | 4.100e+01 | ▁▁▇▁▁ |
| L_LatVent_SD | 25 | 0.04 | 1.730000e+00 | NA | 1.73 | 1.73 | 1.73 | 1.73 | 1.730e+00 | ▁▁▇▁▁ |
| L_Thalamus_mean | 25 | 0.04 | 8.260000e+00 | NA | 8.26 | 8.26 | 8.26 | 8.26 | 8.260e+00 | ▁▁▇▁▁ |
| L_Thalamus_numvoxels | 25 | 0.04 | 6.800000e+01 | NA | 68.00 | 68.00 | 68.00 | 68.00 | 6.800e+01 | ▁▁▇▁▁ |
| L_Thalamus_SD | 25 | 0.04 | 2.760000e+00 | NA | 2.76 | 2.76 | 2.76 | 2.76 | 2.760e+00 | ▁▁▇▁▁ |
| BrainStem_mean | 0 | 1.00 | 6.700000e+00 | 2.050000e+00 | 2.86 | 5.40 | 6.33 | 8.64 | 1.052e+01 | ▃▇▅▇▃ |
| BrainStem_numvoxels | 0 | 1.00 | 6.918800e+02 | 2.264900e+02 | 284.00 | 584.50 | 657.50 | 881.75 | 1.000e+03 | ▃▁▆▂▇ |
| Brainstem_SD | 0 | 1.00 | 3.020000e+00 | 1.140000e+00 | 1.44 | 1.96 | 2.97 | 3.83 | 5.560e+00 | ▇▅▅▅▁ |
| R_CerebralWM_mean | 0 | 1.00 | 5.280000e+00 | 1.280000e+00 | 1.23 | 4.80 | 5.47 | 5.95 | 7.280e+00 | ▁▁▃▇▃ |
| R_CerebralWM_numvoxels | 0 | 1.00 | 1.121420e+03 | 5.192600e+02 | 373.00 | 637.25 | 989.00 | 1584.75 | 2.053e+03 | ▇▆▃▃▅ |
| R_CerebralWM_SD | 0 | 1.00 | 2.750000e+00 | 8.100000e-01 | 1.53 | 2.06 | 2.77 | 3.33 | 4.350e+00 | ▇▂▇▃▂ |
| R_CerebralCortex_mean | 0 | 1.00 | 6.620000e+00 | 1.740000e+00 | 1.83 | 5.97 | 6.33 | 8.14 | 8.870e+00 | ▂▁▃▇▇ |
| R_CerebralCortex_numvoxels | 0 | 1.00 | 5.082150e+03 | 1.513020e+03 | 2881.00 | 3937.50 | 4775.00 | 6032.25 | 8.512e+03 | ▇▇▃▃▃ |
| R_CerebralCortex_SD | 0 | 1.00 | 2.160000e+00 | 6.300000e-01 | 1.31 | 1.71 | 2.00 | 2.53 | 3.560e+00 | ▇▇▇▁▃ |
| R_LatVent_mean | 0 | 1.00 | 3.750000e+00 | 1.780000e+00 | -1.31 | 2.98 | 3.74 | 4.90 | 6.760e+00 | ▁▁▅▇▂ |
| R_LatVent_numvoxels | 0 | 1.00 | 5.423000e+01 | 3.925000e+01 | 0.00 | 29.00 | 43.50 | 80.00 | 1.680e+02 | ▇▇▅▂▁ |
| R_LatVent_S | 0 | 1.00 | 2.310000e+00 | 8.500000e-01 | 0.00 | 1.84 | 2.37 | 2.92 | 3.800e+00 | ▂▁▇▇▃ |
| R_Thalamus_mean | 0 | 1.00 | 6.980000e+00 | 1.940000e+00 | 2.31 | 6.03 | 6.85 | 8.04 | 1.101e+01 | ▁▃▇▅▂ |
| R_Thalamus_numvoxels | 0 | 1.00 | 2.887700e+02 | 1.276000e+02 | 2.00 | 228.25 | 293.00 | 385.75 | 4.840e+02 | ▃▂▇▆▇ |
| R_Thalamus_SD | 0 | 1.00 | 2.120000e+00 | 6.800000e-01 | 0.71 | 1.73 | 2.06 | 2.56 | 3.500e+00 | ▂▇▆▇▂ |
| R_Caudate_mean | 13 | 0.50 | 5.230000e+00 | 2.720000e+00 | 0.00 | 4.47 | 5.72 | 7.38 | 8.400e+00 | ▃▁▆▇▇ |
| R_Caudate_numvoxels | 13 | 0.50 | 6.238000e+01 | 5.778000e+01 | 0.00 | 17.00 | 63.00 | 85.00 | 1.950e+02 | ▇▅▅▂▂ |
| R_Caudate_SD | 13 | 0.50 | 7.692308e+28 | 2.773501e+29 | 0.00 | 1.03 | 1.68 | 2.03 | 1.000e+30 | ▇▁▁▁▁ |
| R_Hipp_mean | 25 | 0.04 | 0.000000e+00 | NA | 0.00 | 0.00 | 0.00 | 0.00 | 0.000e+00 | ▁▁▇▁▁ |
| R_Hipp_numvoxels | 25 | 0.04 | 0.000000e+00 | NA | 0.00 | 0.00 | 0.00 | 0.00 | 0.000e+00 | ▁▁▇▁▁ |
| R_Hipp_SD | 25 | 0.04 | 0.000000e+00 | NA | 0.00 | 0.00 | 0.00 | 0.00 | 0.000e+00 | ▁▁▇▁▁ |
| R_Accumbens_mean | 9 | 0.65 | 6.640000e+00 | 2.160000e+00 | 1.63 | 5.94 | 6.84 | 7.81 | 1.081e+01 | ▁▂▇▅▃ |
| R_Accumbens_numvoxels | 9 | 0.65 | 2.906000e+01 | 2.147000e+01 | 1.00 | 9.00 | 32.00 | 43.00 | 7.000e+01 | ▇▁▅▃▂ |
| R_Accumbens_SD | 9 | 0.65 | 5.882353e+28 | 2.425356e+29 | 0.00 | 0.78 | 1.09 | 1.53 | 1.000e+30 | ▇▁▁▁▁ |
#clean up a bit
cest.vals <- cest.rois %>%
dplyr::select_if(~!all(is.na(.))) %>% #drop any col that's totally empty
mutate(scanid = str_split(as.character(Subject), "/", simplify = T)[, 6]) %>% #get BBLID_scanID
dplyr::select(-Subject)
#GM voxels
cest.gm.rois <- read.csv("data/GM-HarvardOxford-Subcortical-Measures.csv", header = TRUE, stringsAsFactors = F, sep = '\t', na="NaN")
#clean up a bit
gm.vals <- cest.gm.rois %>%
dplyr::select_if(~!all(is.na(.))) %>% #drop any col that's totally empty
mutate(scanid = str_split(as.character(Subject), "/", simplify = T)[, 6]) %>% #get BBLID_scanID
dplyr::select(-Subject) %>%
rename_with(.cols = 1:(ncol(.)-1), function(x){paste0("gm.", x)}) #add prefix
ALL SCANS HAVE RIGHT THALAMUS DATA!
one subject also weirdly has GM values in L_CerebralWM_mean, outputting scan ID so I can look at the niftis
gm.vals %>% filter(!is.na(gm.L_CerebralWM_numvoxels)) %>%dplyr::select(scanid, gm.L_CerebralWM_numvoxels)
## scanid gm.L_CerebralWM_numvoxels
## 1 94276_10927 106
Looks like slab was placed across the midline, seeing if other scans have comparable gm voxels in R_CerebralWM_mean as comparison.
gm.vals %>% filter(!is.na(gm.R_CerebralWM_numvoxels)) %>%dplyr::select(scanid, gm.R_CerebralWM_numvoxels)
## scanid gm.R_CerebralWM_numvoxels
## 1 105176_9332 384
## 2 106573_8900 720
## 3 112126_8979 906
## 4 117397_10686 582
## 5 120217_9029 694
## 6 121000_8859 583
## 7 121407_8987 547
## 8 125073_10972 293
## 9 132179_10918 151
## 10 132179_9726 199
## 11 139272_8985 257
## 12 18093_8924 783
## 13 19790_10934 333
## 14 83835_8777 300
## 15 85369_9132 542
## 16 87225_9459 411
## 17 91919_10683 327
## 18 91962_11090 315
## 19 92886_9087 483
## 20 93242_9146 516
## 21 93274_10992 297
## 22 94028_9203 1029
## 23 94276_10927 249
## 24 96659_11096 256
## 25 97994_11114 215
## 26 99949_9031 284
Seems like having errant GM voxels (calculated probablistically via FAST) isn’t uncommon in WM (as defined by HO Subcortical atlas), what makes 94276_10927 unique is slab orientation.
Merging cest values, remove wonky mOFC data and filter to thalamic ROI
all.cest <- merge(cest.vals, gm.vals, by="scanid") %>%
filter(!scanid %in% c("105176_9332", "132179_9726", "92886_9087", "87225_9459", "94028_9203"))
sum(duplicated(all.cest$BBLID)) #no duplicate subjects remain
## [1] 0
r.thal.dat <- all.cest %>%dplyr::select("scanid" | contains("R_Thalamus"))
skim(r.thal.dat)
| Name | r.thal.dat |
| Number of rows | 21 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| scanid | 0 | 1 | 10 | 12 | 0 | 21 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| R_Thalamus_mean | 0 | 1 | 7.48 | 1.67 | 5.14 | 6.07 | 7.12 | 8.10 | 11.01 | ▇▇▇▂▂ |
| R_Thalamus_numvoxels | 0 | 1 | 319.38 | 111.65 | 80.00 | 242.00 | 340.00 | 418.00 | 484.00 | ▂▃▇▇▇ |
| R_Thalamus_SD | 0 | 1 | 2.27 | 0.58 | 1.45 | 1.79 | 2.28 | 2.61 | 3.50 | ▇▁▆▂▂ |
| gm.R_Thalamus_mean | 0 | 1 | 0.62 | 0.10 | 0.40 | 0.56 | 0.62 | 0.66 | 0.86 | ▁▃▇▃▁ |
| gm.R_Thalamus_numvoxels | 0 | 1 | 223.14 | 71.34 | 126.00 | 160.00 | 221.00 | 300.00 | 341.00 | ▇▅▇▁▇ |
| gm.R_Thalamus_SD | 0 | 1 | 0.35 | 0.03 | 0.23 | 0.35 | 0.35 | 0.37 | 0.39 | ▁▁▂▇▅ |
r.thal.dat %>%dplyr::select(-scanid) %>% ggpairs()
#check for correlation of thalamic volume captured and avg. CEST
cor.test(r.thal.dat$R_Thalamus_mean, r.thal.dat$R_Thalamus_numvoxels, method=c("pearson"))
##
## Pearson's product-moment correlation
##
## data: r.thal.dat$R_Thalamus_mean and r.thal.dat$R_Thalamus_numvoxels
## t = 1.794, df = 19, p-value = 0.08875
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06113673 0.69765788
## sample estimates:
## cor
## 0.3805947
Merging into other FINAL DATASET
clin.to.merge <- sops_change_meds.life %>%
mutate(scanid=paste(BBLID, SCANID, sep="_"))
final.df <- right_join(clin.to.merge, r.thal.dat, by="scanid")
n_unique(final.df$observation)
## [1] 48
n_unique(final.df$BBLID)
## [1] 21
#scaled dataframe
final.df_scaled <- final.df %>% mutate_if(is.numeric, scale)
#baseline only
final.base <- final.df %>% filter(scan.time=="base.1") %>%
mutate(sex=recode_factor(Sex_1M, `1`="Male", `2`="Female"))
#followups only
final.end <- final.df %>% filter(scan.time != "base.1")
#followups only - scaled
final.end_scaled <- final.end %>% mutate_if(is.numeric, scale)
Plotting correlations
skim(final.df)
| Name | final.df |
| Number of rows | 48 |
| Number of columns | 68 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| Date | 3 |
| factor | 8 |
| logical | 4 |
| numeric | 42 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| BBLID | 0 | 1.00 | 5 | 6 | 0 | 21 | 0 |
| scanner | 0 | 1.00 | 3 | 5 | 0 | 2 | 0 |
| SCANID | 0 | 1.00 | 4 | 5 | 0 | 21 | 0 |
| PROTOCOL | 0 | 1.00 | 12 | 25 | 0 | 11 | 0 |
| SPD | 8 | 0.83 | 1 | 1 | 0 | 1 | 0 |
| FHPI | 14 | 0.71 | 1 | 1 | 0 | 2 | 0 |
| FIRST_DEG_SCZ | 16 | 0.67 | 1 | 1 | 0 | 2 | 0 |
| scan.time | 0 | 1.00 | 5 | 6 | 0 | 5 | 0 |
| observation | 0 | 1.00 | 22 | 24 | 0 | 48 | 0 |
| AGEONSET_CLINICRISK | 38 | 0.21 | 1 | 2 | 0 | 10 | 0 |
| scanid | 0 | 1.00 | 10 | 12 | 0 | 21 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date7t | 0 | 1.00 | 2014-03-07 | 2018-12-18 | 2014-10-27 | 20 |
| DOSIPS | 0 | 1.00 | 2014-01-08 | 2022-02-18 | 2018-06-10 | 48 |
| DODIAGNOSIS | 2 | 0.96 | 2014-01-08 | 2022-02-18 | 2018-06-10 | 46 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| TYPE | 0 | 1.00 | FALSE | 2 | Pro: 43, Com: 5, Col: 0 |
| Race | 0 | 1.00 | FALSE | 4 | Bla: 33, Whi: 11, Asi: 2, Mix: 2 |
| Ethnicity | 0 | 1.00 | FALSE | 1 | 2: 48, 1: 0 |
| diagnosis | 2 | 0.96 | FALSE | 4 | pro: 24, psy: 10, non: 9, oth: 3 |
| base_dx | 0 | 1.00 | FALSE | 4 | pro: 19, non: 16, psy: 11, oth: 2 |
| sops_cat | 0 | 1.00 | FALSE | 3 | sta: 25, pro: 13, rem: 10 |
| psych.rx_life | 0 | 1.00 | FALSE | 2 | 0: 39, 1: 9 |
| base_rx | 0 | 1.00 | FALSE | 2 | 0: 40, 1: 8 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| dx_control | 0 | 1 | 0.19 | FAL: 39, TRU: 9 |
| dx_pro | 0 | 1 | 0.50 | FAL: 24, TRU: 24 |
| dx_psych | 0 | 1 | 0.21 | FAL: 38, TRU: 10 |
| dx_other | 0 | 1 | 0.33 | FAL: 32, TRU: 16 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| P1 | 0 | 1.00 | 2.02 | 2.03 | 0.00 | 0.00 | 2.00 | 3.25 | 6.00 | ▇▅▁▁▃ |
| P2 | 0 | 1.00 | 1.38 | 1.65 | 0.00 | 0.00 | 1.00 | 2.00 | 6.00 | ▇▂▁▁▁ |
| P3 | 0 | 1.00 | 0.83 | 1.40 | 0.00 | 0.00 | 0.00 | 1.25 | 6.00 | ▇▁▁▁▁ |
| P4 | 0 | 1.00 | 1.52 | 1.97 | 0.00 | 0.00 | 0.00 | 3.00 | 6.00 | ▇▁▂▁▁ |
| P5 | 0 | 1.00 | 1.23 | 1.64 | 0.00 | 0.00 | 0.00 | 2.00 | 6.00 | ▇▁▂▁▁ |
| N1 | 0 | 1.00 | 0.94 | 1.17 | 0.00 | 0.00 | 1.00 | 2.00 | 5.00 | ▇▂▁▁▁ |
| N2 | 0 | 1.00 | 0.65 | 1.10 | 0.00 | 0.00 | 0.00 | 1.00 | 4.00 | ▇▁▁▁▁ |
| N3 | 0 | 1.00 | 0.60 | 0.92 | 0.00 | 0.00 | 0.00 | 1.00 | 3.00 | ▇▂▁▁▁ |
| N4 | 0 | 1.00 | 0.65 | 1.38 | 0.00 | 0.00 | 0.00 | 0.00 | 5.00 | ▇▁▁▁▁ |
| N5 | 0 | 1.00 | 0.50 | 1.03 | 0.00 | 0.00 | 0.00 | 0.00 | 4.00 | ▇▁▁▁▁ |
| N6 | 0 | 1.00 | 1.15 | 2.16 | 0.00 | 0.00 | 0.00 | 1.00 | 6.00 | ▇▁▁▁▂ |
| D1 | 0 | 1.00 | 0.50 | 1.09 | 0.00 | 0.00 | 0.00 | 0.00 | 5.00 | ▇▁▁▁▁ |
| D2 | 0 | 1.00 | 0.75 | 1.59 | 0.00 | 0.00 | 0.00 | 0.00 | 6.00 | ▇▁▁▁▁ |
| D3 | 1 | 0.98 | 0.85 | 1.20 | 0.00 | 0.00 | 0.00 | 2.00 | 4.00 | ▇▂▂▂▁ |
| D4 | 0 | 1.00 | 0.27 | 0.74 | 0.00 | 0.00 | 0.00 | 0.00 | 4.00 | ▇▁▁▁▁ |
| G1 | 0 | 1.00 | 0.96 | 1.61 | 0.00 | 0.00 | 0.00 | 1.25 | 6.00 | ▇▁▂▁▁ |
| G2 | 0 | 1.00 | 0.98 | 1.60 | 0.00 | 0.00 | 0.00 | 2.00 | 6.00 | ▇▂▁▁▁ |
| G3 | 0 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| G4 | 0 | 1.00 | 0.85 | 1.27 | 0.00 | 0.00 | 0.00 | 1.00 | 5.00 | ▇▁▁▁▁ |
| GAF_C | 15 | 0.69 | 65.79 | 21.25 | 21.00 | 50.00 | 71.00 | 84.00 | 94.00 | ▁▅▂▃▇ |
| GAF_H | 21 | 0.56 | 69.04 | 19.43 | 21.00 | 54.00 | 71.00 | 84.00 | 94.00 | ▁▃▃▂▇ |
| GAF_PCTCHG | 21 | 0.56 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| sops_p | 0 | 1.00 | 6.98 | 7.21 | 0.00 | 1.75 | 5.00 | 10.50 | 29.00 | ▇▂▂▁▁ |
| sops_n | 0 | 1.00 | 4.48 | 4.53 | 0.00 | 1.00 | 3.50 | 7.25 | 21.00 | ▇▅▁▁▁ |
| sops_d | 1 | 0.98 | 2.38 | 3.17 | 0.00 | 0.00 | 1.00 | 3.50 | 13.00 | ▇▃▁▁▁ |
| sops_g | 0 | 1.00 | 2.79 | 3.72 | 0.00 | 0.00 | 1.00 | 4.25 | 12.00 | ▇▁▁▁▁ |
| sops_tot | 1 | 0.98 | 16.72 | 16.62 | 0.00 | 4.50 | 12.00 | 26.00 | 74.00 | ▇▃▁▁▁ |
| sips_diff_days | 0 | 1.00 | 714.17 | 878.28 | -250.00 | -37.75 | 515.50 | 1024.25 | 2799.00 | ▇▆▂▁▂ |
| sips_diff_yrs | 0 | 1.00 | 1.95 | 2.41 | -0.50 | 0.00 | 1.50 | 2.75 | 7.50 | ▇▅▂▁▂ |
| Sex_1M | 0 | 1.00 | 1.69 | 0.47 | 1.00 | 1.00 | 2.00 | 2.00 | 2.00 | ▃▁▁▁▇ |
| age_scan | 0 | 1.00 | 20.03 | 2.66 | 16.26 | 17.24 | 19.95 | 22.23 | 25.09 | ▇▃▆▅▅ |
| age_sops | 0 | 1.00 | 21.98 | 3.08 | 16.99 | 19.84 | 21.79 | 23.59 | 29.42 | ▅▇▇▂▂ |
| dx_sops_diff | 2 | 0.96 | 0.02 | 1.58 | -7.00 | 0.00 | 0.00 | 0.00 | 8.00 | ▁▁▇▁▁ |
| base_gafc | 2 | 0.96 | 68.39 | 20.86 | 21.00 | 50.00 | 78.00 | 87.00 | 94.00 | ▁▅▂▂▇ |
| base_sops_p | 0 | 1.00 | 6.50 | 7.51 | 0.00 | 1.00 | 4.00 | 9.00 | 29.00 | ▇▂▂▁▁ |
| delta_sops | 0 | 1.00 | 0.48 | 4.68 | -13.00 | 0.00 | 0.00 | 2.00 | 15.00 | ▁▁▇▂▁ |
| R_Thalamus_mean | 0 | 1.00 | 7.31 | 1.66 | 5.14 | 6.05 | 7.05 | 8.10 | 11.01 | ▇▆▆▂▂ |
| R_Thalamus_numvoxels | 0 | 1.00 | 302.98 | 118.71 | 80.00 | 238.00 | 305.00 | 398.50 | 484.00 | ▅▃▇▇▇ |
| R_Thalamus_SD | 0 | 1.00 | 2.25 | 0.54 | 1.45 | 1.79 | 2.23 | 2.59 | 3.50 | ▇▂▆▂▂ |
| gm.R_Thalamus_mean | 0 | 1.00 | 0.62 | 0.09 | 0.40 | 0.56 | 0.60 | 0.66 | 0.86 | ▁▅▇▃▁ |
| gm.R_Thalamus_numvoxels | 0 | 1.00 | 216.10 | 69.30 | 126.00 | 159.25 | 196.00 | 265.50 | 341.00 | ▇▃▅▁▆ |
| gm.R_Thalamus_SD | 0 | 1.00 | 0.35 | 0.03 | 0.23 | 0.35 | 0.35 | 0.37 | 0.39 | ▁▁▁▇▅ |
final.df %>%
dplyr::select_if(is.numeric) %>%
cor(., use = "pairwise.complete.obs") %>%
corrplot(method = "color",
type = "upper",
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt = 45, #Text label color and rotation
# hide correlation coefficient on the principal diagonal
diag = FALSE
)
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is zero
Some basic demographic info on the entire sample:
summary(final.df$age_sops)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.99 19.84 21.79 21.98 23.59 29.42
summary(final.df$age_scan)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.26 17.24 19.95 20.03 22.23 25.09
p.age_scan <- final.df %>%
ggplot() +
geom_histogram(aes(x = age_scan), fill="cadetblue4") +
ggtitle("Age at Scan")
p.age_sops <- final.df %>%
ggplot() +
geom_histogram(aes(x = age_sops), fill="darkslateblue") +
ggtitle("Age at SOPS")
age_plot <- ggarrange(p.age_scan, p.age_sops, common.legend = F, legend = "bottom", ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
annotate_figure(p=age_plot, top = text_grob("Age of Participants at Scan and Clinical Assessments",
color = "black", face = "bold", size = 14))
Visualize spread of followup timepoints for possible future binning:
final.end %>%
ggplot() +
geom_histogram(aes(x = sips_diff_days), fill="darkseagreen4", color="white", bins=10) +
ggtitle("Time to Clinical Follow-Up")
#10 looks like good # of bins
final.bin_scaled <- final.end %>%
mutate(sips_diff_bin=ntile(sips_diff_days, n=10)) %>%
mutate_if(is.numeric, scale)
demo <- final.df %>% filter(scan.time=="base.1") %>%
group_by(scanner) %>%
summarise(n = n(),
age=mean(age_scan),
age_sd=sd(age_scan))
demo.base <- atable(final.base,
target_cols=c("sex", "age_scan", "Race", "Ethnicity", "base_dx", "TYPE", "base_sops_p", "base_gafc", "psych.rx_life"),
group_col="scanner",
format_to="HTML")
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
## Warning in two_sample_htest.factor(value, group, ...): Not enough values.
## Returning p-value=NaN..
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
tpt.table <- atable(final.df,
target_cols=c("scan.time", "age_sops", "sips_diff_yrs", "diagnosis", "sops_p", "sops_n", "sops_d", "sops_g", "GAF_C", "psych.rx_life"),
group_col="scanner",
format_to="HTML")
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
Compare Thalamic CEST data by scanner:
final.df %>%dplyr::select(scanner | contains("Thalamus")) %>% ggpairs(aes(color=scanner))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cest.table <- atable(final.base,
target_cols=c("R_Thalamus_mean", "R_Thalamus_numvoxels", "gm.R_Thalamus_mean", "gm.R_Thalamus_numvoxels"),
group_col="scanner",
format_to = "HTML")
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
Are CEST values or demographics significantly different between scanners?
#check for differences in baseline diagnoses
chisq.test(final.base$scanner, final.base$base_dx, simulate.p.value=TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: final.base$scanner and final.base$base_dx
## X-squared = 2.9448, df = NA, p-value = 0.4913
#check for differences in scan age
chisq.test(final.base$scanner, final.base$age_scan, simulate.p.value=TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: final.base$scanner and final.base$age_scan
## X-squared = 21, df = NA, p-value = 1
#check for differences in baseline SOPS Positive
t.test(base_sops_p ~ scanner, data=final.base)
##
## Welch Two Sample t-test
##
## data: base_sops_p by scanner
## t = 0.24971, df = 18.875, p-value = 0.8055
## alternative hypothesis: true difference in means between group ONM and group Terra is not equal to 0
## 95 percent confidence interval:
## -6.512717 8.276353
## sample estimates:
## mean in group ONM mean in group Terra
## 7.181818 6.300000
#check for differences in thalamic CEST
t.test(R_Thalamus_mean ~ scanner, data = final.base)
##
## Welch Two Sample t-test
##
## data: R_Thalamus_mean by scanner
## t = -4.0043, df = 13.984, p-value = 0.001308
## alternative hypothesis: true difference in means between group ONM and group Terra is not equal to 0
## 95 percent confidence interval:
## -3.432339 -1.037787
## sample estimates:
## mean in group ONM mean in group Terra
## 6.411717 8.646781
#check for differences in thalamic volume captured
t.test(R_Thalamus_numvoxels ~ scanner, data = final.base)
##
## Welch Two Sample t-test
##
## data: R_Thalamus_numvoxels by scanner
## t = -2.2206, df = 17.739, p-value = 0.03966
## alternative hypothesis: true difference in means between group ONM and group Terra is not equal to 0
## 95 percent confidence interval:
## -194.109892 -5.271926
## sample estimates:
## mean in group ONM mean in group Terra
## 271.9091 371.6000
#look for correlations between volume and CEST when controlling for scan
m.cest.vol <- lm(R_Thalamus_mean ~ scanner + R_Thalamus_numvoxels, data=final.base)
summary(m.cest.vol)
##
## Call:
## lm(formula = R_Thalamus_mean ~ scanner + R_Thalamus_numvoxels,
## data = final.base)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6310 -0.5864 -0.1908 0.6115 2.4582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.066894 0.868308 6.987 1.59e-06 ***
## scannerTerra 2.108639 0.624989 3.374 0.00338 **
## R_Thalamus_numvoxels 0.001268 0.002865 0.443 0.66328
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.272 on 18 degrees of freedom
## Multiple R-squared: 0.4761, Adjusted R-squared: 0.4179
## F-statistic: 8.18 on 2 and 18 DF, p-value: 0.002971
Anova(m.cest.vol)
## Anova Table (Type II tests)
##
## Response: R_Thalamus_mean
## Sum Sq Df F value Pr(>F)
## scanner 18.4271 1 11.383 0.003381 **
## R_Thalamus_numvoxels 0.3172 1 0.196 0.663281
## Residuals 29.1386 18
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Scanner is not significantly related to baseline diagnosis, baseline SOPS positive score, or age at scan but is significantly related to CEST contrast (averaged over thalamus) and number of thalamic volumes captured. After controlling for scanner, number of voxels is not significantly associated with mean CEST value in the thalamus, therefore scanner (but not volume) will be controlled for in the final model.
Look at correlations between lifetime psych rx and diagnosis - unsurprisingly these are highly correlated values
kruskal.test(final.df$psych.rx_life~final.df$diagnosis)
##
## Kruskal-Wallis rank sum test
##
## data: final.df$psych.rx_life by final.df$diagnosis
## Kruskal-Wallis chi-squared = 24.175, df = 3, p-value = 2.296e-05
ggplot(final.df) +
geom_count(aes(x = factor(diagnosis), y = as.factor(psych.rx_life))) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggtitle("SOPS Total Score vs Axis 1 Dx")
# final.df %>%
# group_by(BBLID) %>%
# sample_n(1) %>% #randomdplyr::selection
# ggplot(aes(x = diagnosis, y = GAF_C)) +
# geom_point(aes(color=psych.rx_life), show.legend = TRUE) +
# geom_jitter(width = 0.25) +
# theme(axis.text.x = element_text(angle = 45, hjust=1)) +
# ggtitle("GAF vs Axis 1 Dx")
final.df %>%
group_by(BBLID) %>%
ggplot(aes(x = diagnosis, y = sops_p, color=psych.rx_life)) +
geom_point(aes(color=psych.rx_life), show.legend = TRUE) +
geom_jitter(width = 0.25) +
theme(axis.text.x = element_text(angle = 45, hjust=1)) +
ggtitle("SOPS Positive Score vs Axis 1 Dx")
t.test(sops_p ~ psych.rx_life, data = final.df)
##
## Welch Two Sample t-test
##
## data: sops_p by psych.rx_life
## t = -4.0896, df = 9.0766, p-value = 0.002671
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -18.722205 -5.397453
## sample estimates:
## mean in group 0 mean in group 1
## 4.717949 16.777778
final.df %>%
group_by(BBLID, SCANID) %>%
mutate(base_rx=first(psych.rx_life)) %>%
t.test(sops_p ~ base_rx, data = .)
##
## Welch Two Sample t-test
##
## data: sops_p by base_rx
## t = -4.719, df = 7.9491, p-value = 0.00153
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -20.141533 -6.908467
## sample estimates:
## mean in group 0 mean in group 1
## 4.725 18.250
Higher SOPS P scores are predictably related to psychosis diagnosis and psychiatric medication.
Look at CEST data by diagnosis and diagnostic category:
#plots
final.base %>% dplyr::select(base_dx | contains("Thalamus")) %>%
filter(base_dx != "other") %>% #too few other dx, need to remove
ggpairs(aes(color=base_dx)) +
ggtitle("Thalamic Cest Values by Baseline Diagnosis")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
final.base %>%
dplyr::select(dx_control | contains("Thalamus")) %>%
mutate(dx_control=recode_factor(as.factor(dx_control), 'TRUE'="Control", 'FALSE'="Patient")) %>%
ggpairs(aes(color=dx_control)) +
ggtitle("Thalamic Cest Values: Patients vs Controls")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#linear model
lm.dx <- lm(R_Thalamus_mean ~ scanner + base_dx + Sex_1M + age_scan, data=final.base)
summary(lm.dx)
##
## Call:
## lm(formula = R_Thalamus_mean ~ scanner + base_dx + Sex_1M + age_scan,
## data = final.base)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.23978 -0.72815 -0.03797 0.51294 2.21595
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6077 2.6598 0.980 0.34352
## scannerTerra 2.4059 0.6405 3.756 0.00213 **
## base_dxother -2.2145 1.6081 -1.377 0.19012
## base_dxpro -0.9952 0.7804 -1.275 0.22301
## base_dxpsych -0.2657 0.8451 -0.314 0.75788
## Sex_1M 0.3039 0.6656 0.457 0.65494
## age_scan 0.1841 0.1298 1.418 0.17804
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.279 on 14 degrees of freedom
## Multiple R-squared: 0.5885, Adjusted R-squared: 0.4121
## F-statistic: 3.337 on 6 and 14 DF, p-value: 0.02961
Anova(lm.dx)
## Anova Table (Type II tests)
##
## Response: R_Thalamus_mean
## Sum Sq Df F value Pr(>F)
## scanner 23.0664 1 14.1081 0.002128 **
## base_dx 4.7503 3 0.9685 0.435121
## Sex_1M 0.3409 1 0.2085 0.654936
## age_scan 3.2879 1 2.0110 0.178038
## Residuals 22.8896 14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Overall, baseline diagnosis does not appear to be significantly associated with thalamic cest when controlling for sex, age and scanner type, while scanner type does remain predictive.
Looking at baseline diagnosis vs GAF
ggplot(final.df, aes(x=sips_diff_days, y=GAF_C, color=base_dx)) +
geom_point() +
xlab("Time from Scan") +
ylab("GAF") +
ggtitle("GAF Progression by for Each Dx Category") +
geom_smooth(method="lm") +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
sum(is.na(final.df$GAF_C))
## [1] 15
ggplot(final.df, aes(x=sips_diff_days, y=sops_p, color=base_dx)) +
geom_point() +
xlab("Time from Scan") +
ylab("SOPS Positive Score") +
ggtitle("SOPS-P Progression by for Each Dx Category") +
geom_smooth(method="lm") +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
All groups appear to have relatively stable SOPS-P scores when averaged across individuals. But can CEST values predict and individual’s change?
# ggplot(final.df, aes(x=sips_diff_days, y=sops_p, color=BBLID)) +
# geom_point() +
# xlab("Time from Scan") +
# ylab("SOPS Positive Score") +
# ggtitle("SOPS-P Progression by for Each Subject") +
# geom_smooth(method="lm",se = FALSE) +
# theme_bw()
ggplot(final.df, aes(x=sips_diff_days, y=sops_p, color=BBLID)) +
geom_line() +
xlab("Time from Scan") +
ylab("SOPS Positive Score") +
ggtitle("SOPS-P Progression by for Each Subject") +
geom_smooth(method="lm",se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
#plot of change by dx
final.end_scaled %>%
ggplot() +
geom_histogram(aes(x = delta_sops, fill=base_dx, color=base_dx), position="dodge", alpha=0.5) +
ggtitle("Change in SOPS Positive Score by Baseline Diagnosis")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot change in sops positive by glutamate (similar to egerton et al fig 2)
final.end_scaled %>%
filter(base_dx=="pro"| base_dx=="psych")
## # A tibble: 17 × 68
## BBLID date7t scanner SCANID PROTOCOL DOSIPS TYPE P1[,1] P2[,1]
## <chr> <date> <chr> <chr> <chr> <date> <fct> <dbl> <dbl>
## 1 93274 2018-10-23 Terra 10992 833922 - Ev… 2019-11-14 Prob… 1.96 1.60
## 2 99949 2014-08-20 ONM 9031 818964 - TI… 2015-11-05 Prob… -1.06 -0.902
## 3 94276 2018-09-07 Terra 10927 829502 - MO… 2020-02-04 Prob… 0.447 1.60
## 4 18093 2014-06-10 ONM 8924 810336 - Go3 2015-11-08 Prob… 0.951 -0.278
## 5 139272 2014-07-22 ONM 8985 810336 - Go3 2016-01-07 Prob… -0.0559 -0.902
## 6 85369 2014-10-27 ONM 9132 810336 - TI… 2016-04-19 Prob… -1.06 0.347
## 7 99949 2014-08-20 ONM 9031 810336 - GO… 2016-03-19 Prob… -1.06 -0.278
## 8 117397 2018-02-01 Terra 10686 833922 - Ev… 2019-12-05 Prob… -0.559 0.347
## 9 91962 2018-11-30 Terra 11090 829502 - MO… 2020-10-26 Prob… 1.45 1.60
## 10 99949 2014-08-20 ONM 9031 818964 - TI… 2016-10-19 Prob… -1.06 -0.278
## 11 132179 2018-08-15 Terra 10918 829502 - MO… 2020-12-09 Prob… -0.0559 0.347
## 12 91919 2018-01-31 Terra 10683 829502 - MO… 2020-08-12 Prob… -1.06 0.347
## 13 125073 2018-10-11 Terra 10972 833922 - Ev… 2021-05-13 Prob… 0.951 0.971
## 14 139272 2014-07-22 ONM 8985 16-013305 -… 2018-02-17 Prob… -0.0559 -0.902
## 15 120217 2014-08-19 ONM 9029 16-013305 -… 2019-02-22 Prob… 1.96 2.84
## 16 99949 2014-08-20 ONM 9031 833922 - Ev… 2021-06-01 Prob… -0.0559 -0.278
## 17 106573 2014-05-22 ONM 8900 843329 - Lo… 2022-01-19 Prob… 1.45 0.347
## # … with 59 more variables: P3 <dbl[,1]>, P4 <dbl[,1]>, P5 <dbl[,1]>,
## # N1 <dbl[,1]>, N2 <dbl[,1]>, N3 <dbl[,1]>, N4 <dbl[,1]>, N5 <dbl[,1]>,
## # N6 <dbl[,1]>, D1 <dbl[,1]>, D2 <dbl[,1]>, D3 <dbl[,1]>, D4 <dbl[,1]>,
## # G1 <dbl[,1]>, G2 <dbl[,1]>, G3 <dbl[,1]>, G4 <dbl[,1]>, GAF_C <dbl[,1]>,
## # GAF_H <dbl[,1]>, GAF_PCTCHG <dbl[,1]>, SPD <chr>, FHPI <chr>,
## # FIRST_DEG_SCZ <chr>, sops_p <dbl[,1]>, sops_n <dbl[,1]>, sops_d <dbl[,1]>,
## # sops_g <dbl[,1]>, sops_tot <dbl[,1]>, sips_diff_days <dbl[,1]>, …
ggplot(data = final.end_scaled, mapping = aes(x = R_Thalamus_mean, y = delta_sops)) +
geom_point(na.rm = T, aes(col = diagnosis)) +
geom_smooth(method = "lm", na.rm=T, col = "black", se=T) +
theme(legend.position = "right") +
labs(title="Thalamic Glutamate by Change in SOPS Positive \n Among Prodrome and Psychosis Pts (Scaled)",
x ="Thalamic Glutamate Contrast", y = "Change in SOPS Positive", color="Endpoint Diagnosis")
## `geom_smooth()` using formula 'y ~ x'
The plot is vaguely suggestive of that in Egerton et al, but clearly not significant. However, these are scaled raw values while their plot reported “Baseline thalamic glutamate level, adjusted for baseline attenuated positive symptom score”.
Plotting simple linear model for visualization.
ggplot(data = final.df, mapping = aes(x = sips_diff_days, y = sops_p)) +
geom_point(na.rm = T, aes(col = BBLID)) +
geom_smooth(method = "lm", na.rm = T, col = "black", se = F) +
theme(legend.position = "right")
## `geom_smooth()` using formula 'y ~ x'
At the population level, SOPS Positive scores remained stable over time.
Lm to look for any interactions of thalamic CEST and diagnosis:
lm.int <- lm(sops_p ~ R_Thalamus_mean*base_dx, data=final.df)
Anova(lm.int)
## Note: model has aliased coefficients
## sums of squares computed by model comparison
## Anova Table (Type II tests)
##
## Response: sops_p
## Sum Sq Df F value Pr(>F)
## R_Thalamus_mean 9.96 1 0.2992 0.5874
## base_dx 1034.24 3 10.3536 3.353e-05 ***
## R_Thalamus_mean:base_dx 0.59 2 0.0089 0.9912
## Residuals 1365.18 41
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm.int2 <- lm(sops_p ~ R_Thalamus_mean*R_Thalamus_numvoxels, data=final.df)
Anova(lm.int2)
## Anova Table (Type II tests)
##
## Response: sops_p
## Sum Sq Df F value Pr(>F)
## R_Thalamus_mean 11.38 1 0.2152 0.6450
## R_Thalamus_numvoxels 26.79 1 0.5068 0.4803
## R_Thalamus_mean:R_Thalamus_numvoxels 46.93 1 0.8876 0.3513
## Residuals 2326.29 44
lm.int3 <- lm(sops_p ~ R_Thalamus_mean*sips_diff_days, data=final.df)
Anova(lm.int3)
## Anova Table (Type II tests)
##
## Response: sops_p
## Sum Sq Df F value Pr(>F)
## R_Thalamus_mean 42.15 1 0.7758 0.3832
## sips_diff_days 0.02 1 0.0003 0.9854
## R_Thalamus_mean:sips_diff_days 9.80 1 0.1803 0.6731
## Residuals 2390.19 44
No evidence of meaningful interactions that should be included in the final model.
Using scaled data so model will converge more readily. No need for random slopes, since each subject’s delta_sops allows for varying clinical trajectories all on their own. Slope in this case will be analogous to the relationship between cest and improvement, which we hope will be the same/generalizable across subjects, rather than between cest and scores.
fit.delta.1 <- lmer(delta_sops ~ sips_diff_days + (1|BBLID), REML = T, data = final.end_scaled )
summary(fit.delta.1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ sips_diff_days + (1 | BBLID)
## Data: final.end_scaled
##
## REML criterion at convergence: 69.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.60842 -0.15693 -0.03706 0.22819 1.57359
##
## Random effects:
## Groups Name Variance Std.Dev.
## BBLID (Intercept) 0.6808 0.8251
## Residual 0.2272 0.4767
## Number of obs: 27, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.0742 0.2048 19.1533 0.362 0.7211
## sips_diff_days 0.3455 0.1538 15.5793 2.247 0.0395 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## sps_dff_dys 0.029
#plot
delta1_coefs <- coef(fit.delta.1)$BBLID %>%
rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>%
rownames_to_column("BBLID")
model.end.1.data <- left_join(final.end_scaled, delta1_coefs, by="BBLID")
ggplot(data = model.end.1.data, mapping = aes(x = sips_diff_days, y = delta_sops)) +
geom_point(na.rm = T, aes(col = BBLID)) +
geom_abline(aes(intercept = Intercept,
slope = Slope,
colour = BBLID
),
size = 0.5
) +
theme(legend.position = "right")
Adding baseline SOPS P to model
fit.delta.2 <- lmer(delta_sops ~ sips_diff_days + base_sops_p + (1|BBLID), REML = T, data = final.end_scaled)
summary(fit.delta.2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ sips_diff_days + base_sops_p + (1 | BBLID)
## Data: final.end_scaled
##
## REML criterion at convergence: 67
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.66332 -0.26629 -0.01428 0.13924 1.50440
##
## Random effects:
## Groups Name Variance Std.Dev.
## BBLID (Intercept) 0.5487 0.7408
## Residual 0.2231 0.4723
## Number of obs: 27, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.09076 0.18850 18.19000 0.481 0.6359
## sips_diff_days 0.25698 0.15349 15.11621 1.674 0.1146
## base_sops_p -0.40231 0.18643 20.93004 -2.158 0.0427 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) sps_d_
## sps_dff_dys 0.014
## base_sops_p -0.045 0.289
# #plot
# delta2_coefs <- coef(fit.delta.2)$BBLID %>%
# rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>%
# rownames_to_column("BBLID")
#
# model.end.2.data <- left_join(final.bin_scaled, delta2_coefs, by="BBLID")
#
# ggplot(data = model.end.2.data, mapping = aes(x = sips_diff_days, y = delta_sops)) +
# geom_point(na.rm = T, aes(col = BBLID)) +
# geom_abline(aes(intercept = Intercept,
# slope = Slope,
# colour = BBLID
# ),
# size = 0.5
# ) +
# theme(legend.position = "right")
Adding demographics
fit.delta.3 <- lmer(delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + (1 |BBLID), REML = T, data = final.end_scaled)
summary(fit.delta.3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days +
## (1 | BBLID)
## Data: final.end_scaled
##
## REML criterion at convergence: 69
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.64720 -0.24782 -0.00751 0.16224 1.52153
##
## Random effects:
## Groups Name Variance Std.Dev.
## BBLID (Intercept) 0.6198 0.7873
## Residual 0.2219 0.4710
## Number of obs: 27, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.08304 0.20406 15.63767 0.407 0.6896
## Sex_1M 0.10920 0.21098 17.69300 0.518 0.6112
## age_scan 0.08623 0.22518 15.92753 0.383 0.7068
## base_sops_p -0.46468 0.21659 19.19893 -2.145 0.0449 *
## sips_diff_days 0.24144 0.16183 13.04497 1.492 0.1595
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Sex_1M ag_scn bs_sp_
## Sex_1M 0.098
## age_scan -0.242 -0.046
## base_sops_p -0.003 -0.331 -0.277
## sps_dff_dys -0.029 -0.246 0.089 0.303
#Anova(fit.delta.3)
# #plot
# delta3_coefs <- coef(fit.delta.3)$BBLID %>%
# rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>%
# rownames_to_column("BBLID")
#
# model.end.3.data <- left_join(final.bin_scaled, delta3_coefs, by="BBLID")
#
# ggplot(data = model.end.3.data, mapping = aes(x = sips_diff_days, y = delta_sops)) +
# geom_point(na.rm = T, aes(col = BBLID)) +
# geom_abline(aes(intercept = Intercept,
# slope = Slope,
# colour = BBLID
# ),
# size = 0.5
# ) +
# theme(legend.position = "right")
Adding in CEST values along with scanner, since Thalamic glucest contrast varies significantly between scanner.
fit.delta.4 <- lmer(delta_sops ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_sops_p + sips_diff_days + (1 |BBLID), REML = T, data = final.end_scaled )
summary(fit.delta.4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ R_Thalamus_mean + scanner + Sex_1M + age_scan +
## base_sops_p + sips_diff_days + (1 | BBLID)
## Data: final.end_scaled
##
## REML criterion at convergence: 66.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.62261 -0.23859 -0.05075 0.07283 1.62260
##
## Random effects:
## Groups Name Variance Std.Dev.
## BBLID (Intercept) 0.5497 0.7414
## Residual 0.2430 0.4930
## Number of obs: 27, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.26974 0.30711 15.35941 -0.878 0.3933
## R_Thalamus_mean -0.08515 0.27964 14.38930 -0.304 0.7651
## scannerTerra 0.85717 0.59174 17.87691 1.449 0.1648
## Sex_1M 0.11436 0.20557 15.12336 0.556 0.5861
## age_scan -0.04455 0.24164 14.01050 -0.184 0.8564
## base_sops_p -0.35572 0.22190 18.48650 -1.603 0.1259
## sips_diff_days 0.37532 0.18367 9.28487 2.043 0.0704 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) R_Thl_ scnnrT Sex_1M ag_scn bs_sp_
## R_Thalms_mn 0.440
## scannerTerr -0.766 -0.614
## Sex_1M 0.072 0.083 -0.019
## age_scan 0.016 -0.205 -0.178 -0.072
## base_sops_p -0.233 -0.110 0.296 -0.315 -0.324
## sps_dff_dys -0.341 -0.114 0.414 -0.219 -0.069 0.407
#Anova(fit.delta.4)
#plot
delta4_coefs <- coef(fit.delta.4)$BBLID %>%
rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>%
rownames_to_column("BBLID")
model.end.4.data <- left_join(final.end_scaled, delta4_coefs, by="BBLID")
ggplot(data = model.end.4.data, mapping = aes(x = sips_diff_days, y = delta_sops)) +
geom_point(na.rm = T, aes(col = BBLID)) +
geom_abline(aes(intercept = Intercept,
slope = Slope,
colour = BBLID
),
size = 0.5
) +
theme(legend.position = "right")
Visualizing relationship with 3D plot
# # Compute the linear regression (z = ax + by + d)
# fit.3d <- lm(final.bin_scaled$delta_sops ~ final.bin_scaled$sips_diff_days + final.bin_scaled$R_Thalamus_mean)
# # predict values on regular xy grid
# grid.lines = 26
# x.pred <- seq(min(final.bin_scaled$sips_diff_days), max(final.bin_scaled$sips_diff_days), length.out = grid.lines)
# y.pred <- seq(min(final.bin_scaled$R_Thalamus_mean), max(final.bin_scaled$R_Thalamus_mean), length.out = grid.lines)
# xy <- expand.grid( x = x.pred, y = y.pred)
# z.pred <- matrix(predict(fit.3d, newdata = xy),
# nrow = grid.lines, ncol = grid.lines)
# # fitted points for droplines to surface
# fitpoints <- predict(fit.3d)
scatter3D(final.end_scaled$sips_diff_days, final.end_scaled$R_Thalamus_mean, final.end_scaled$delta_sops, colvar = final.end_scaled$R_Thalamus_mean, bty = "b2", col = NULL, add = FALSE, theta = 110, phi = 20, main = "", xlab = "Time", ylab ="Thalamic CEST", zlab = "Change in SOPS P")
plotrgl()
Given the small sample size, it’s risky to add more predictors. However, we’ll try just adding the indicator for lifetime psych medication at time of scan.
fit.delta.5 <- lmer(delta_sops ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + (1 |BBLID), REML = T, data = final.end_scaled )
summary(fit.delta.5)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ R_Thalamus_mean + scanner + Sex_1M + age_scan +
## base_sops_p + sips_diff_days + base_rx + (1 | BBLID)
## Data: final.end_scaled
##
## REML criterion at convergence: 59.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.72014 -0.36236 -0.04593 0.30946 1.39239
##
## Random effects:
## Groups Name Variance Std.Dev.
## BBLID (Intercept) 0.3177 0.5636
## Residual 0.2395 0.4894
## Number of obs: 27, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.518725 0.269563 12.817737 -1.924 0.07680 .
## R_Thalamus_mean -0.124063 0.233369 13.341957 -0.532 0.60373
## scannerTerra 0.798900 0.504606 17.040425 1.583 0.13175
## Sex_1M 0.186266 0.174901 14.196357 1.065 0.30467
## age_scan 0.007742 0.201728 13.001764 0.038 0.96997
## base_sops_p -0.810457 0.255229 16.701783 -3.175 0.00563 **
## sips_diff_days 0.372218 0.172108 11.663537 2.163 0.05209 .
## base_rx1 1.628971 0.591683 14.060745 2.753 0.01550 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) R_Thl_ scnnrT Sex_1M ag_scn bs_sp_ sps_d_
## R_Thalms_mn 0.440
## scannerTerr -0.714 -0.602
## Sex_1M 0.026 0.089 -0.058
## age_scan 0.013 -0.207 -0.198 -0.044
## base_sops_p 0.018 -0.050 0.291 -0.364 -0.309
## sps_dff_dys -0.334 -0.127 0.458 -0.251 -0.089 0.385
## base_rx1 -0.304 -0.062 -0.071 0.184 0.096 -0.669 -0.080
#Anova(fit.delta.5)
Finally, I’ll fit a full model without CEST data for comparison.
fit.delta.6 <- lmer(delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + (1 |BBLID), REML = T, data = final.end_scaled)
summary(fit.delta.6)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days +
## base_rx + (1 | BBLID)
## Data: final.end_scaled
##
## REML criterion at convergence: 61.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.69054 -0.30913 -0.08562 0.29935 1.43460
##
## Random effects:
## Groups Name Variance Std.Dev.
## BBLID (Intercept) 0.3645 0.6037
## Residual 0.2273 0.4768
## Number of obs: 27, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.2132 0.1959 13.3378 -1.089 0.29560
## Sex_1M 0.1991 0.1798 16.0388 1.107 0.28451
## age_scan 0.1123 0.1864 13.8319 0.603 0.55638
## base_sops_p -0.9478 0.2482 16.8593 -3.818 0.00139 **
## sips_diff_days 0.2307 0.1502 15.6798 1.536 0.14441
## base_rx1 1.7329 0.6040 15.8504 2.869 0.01122 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Sex_1M ag_scn bs_sp_ sps_d_
## Sex_1M -0.021
## age_scan -0.206 -0.034
## base_sops_p 0.341 -0.377 -0.225
## sps_dff_dys -0.015 -0.268 0.098 0.263
## base_rx1 -0.510 0.191 0.034 -0.674 -0.024
#Anova(fit.delta.6)
Using anova to compare nested model fits:
anova(fit.delta.5, fit.delta.6, test="Chisq")
## refitting model(s) with ML (instead of REML)
## Data: final.end_scaled
## Models:
## fit.delta.6: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + (1 | BBLID)
## fit.delta.5: delta_sops ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + (1 | BBLID)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## fit.delta.6 8 68.276 78.643 -26.138 52.276
## fit.delta.5 10 68.285 81.243 -24.142 48.285 3.9912 2 0.1359
There is no significant difference between fit.delta.5 and fit.delta.6, suggesting that adding thalamic CEST values do not improve the model.
“The residual variance estimate can be thought of as the within groups variance and each random effect variance estimate can be thought of as a between groups estimate”
Diagnostics
#linearity - plotting residuals
plot(fit.delta.6)
#normality - plotting quantiles
qqnorm(residuals(fit.delta.6))
Heteroscedastic - model fits less well at higher (and more important) sops_p score changes; adding base_dx to try to improve fit.
fit.delta.7 <- lmer(delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + base_dx + (1|BBLID), REML = T, data = final.end_scaled)
summary(fit.delta.7)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days +
## base_rx + base_dx + (1 | BBLID)
## Data: final.end_scaled
##
## REML criterion at convergence: 55.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6842 -0.2726 -0.1425 0.2651 1.5620
##
## Random effects:
## Groups Name Variance Std.Dev.
## BBLID (Intercept) 0.4259 0.6526
## Residual 0.2139 0.4625
## Number of obs: 27, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.47328 0.37352 14.33122 -1.267 0.22533
## Sex_1M 0.15545 0.20455 14.91548 0.760 0.45912
## age_scan 0.04311 0.22122 11.93979 0.195 0.84876
## base_sops_p -0.95435 0.29883 14.17117 -3.194 0.00642 **
## sips_diff_days 0.27940 0.16038 12.47019 1.742 0.10607
## base_rx1 2.28260 0.86868 10.73064 2.628 0.02394 *
## base_dxother 0.15209 1.01056 14.34766 0.150 0.88247
## base_dxpro 0.56908 0.49614 14.35397 1.147 0.27014
## base_dxpsych -0.20011 0.92859 11.49886 -0.216 0.83315
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Sex_1M ag_scn bs_sp_ sps_d_ bs_rx1 bs_dxt bs_dxpr
## Sex_1M 0.264
## age_scan 0.091 0.137
## base_sops_p 0.362 -0.308 -0.298
## sps_dff_dys -0.284 -0.350 -0.033 0.162
## base_rx1 -0.076 0.166 -0.062 -0.225 -0.042
## base_dxothr -0.340 -0.349 -0.440 0.255 0.203 -0.032
## base_dxpro -0.799 -0.309 -0.266 -0.169 0.324 0.052 0.375
## bas_dxpsych -0.569 -0.183 0.016 -0.383 0.191 -0.589 0.131 0.442
#Anova(fit.delta.7)
anova(fit.delta.6, fit.delta.7, test="Chisq")
## refitting model(s) with ML (instead of REML)
## Data: final.end_scaled
## Models:
## fit.delta.6: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + (1 | BBLID)
## fit.delta.7: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + base_dx + (1 | BBLID)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## fit.delta.6 8 68.276 78.643 -26.138 52.276
## fit.delta.7 11 71.236 85.490 -24.618 49.236 3.0401 3 0.3855
Baseline dx does not appear to add to the fit of the model.
Classifying each subject as progression vs remission based on whether trajectory from baseline to final endpoint SOPS-P is + or -, looking for between-group CEST differences (similar to Allen et al Fig 3A & binary logistic regression run in Egerton).
#first subset df to final tp for each subject
end.df <- final.df %>%
group_by(BBLID) %>%
slice_tail() %>%
ungroup() %>%
mutate(sops_worse = case_when(sops_cat == "remit" ~ 0,
sops_cat == "stable" ~ 0,
sops_cat == "progress" ~ 1))
table(end.df$sops_worse) #pretty even split
##
## 0 1
## 10 11
fit.stat <- glm(sops_worse ~ R_Thalamus_mean + scanner, data=end.df, family=binomial(logit))
Anova(fit.stat)
## Analysis of Deviance Table (Type II tests)
##
## Response: sops_worse
## LR Chisq Df Pr(>Chisq)
## R_Thalamus_mean 0.59186 1 0.4417
## scanner 1.02959 1 0.3103
#confirm that these results don't change when removing controls
end.df.pts <- end.df %>%
filter(base_dx!= "none")
fit.stat.pt <- glm(sops_worse ~ R_Thalamus_mean + scanner, data=end.df.pts, family=binomial(logit))
Anova(fit.stat.pt)
## Analysis of Deviance Table (Type II tests)
##
## Response: sops_worse
## LR Chisq Df Pr(>Chisq)
## R_Thalamus_mean 0.88887 1 0.3458
## scanner 1.02995 1 0.3102
Thalamic glutamate levels do not differ between remitters and progressors in our sample, whether or not controls are included in the sample.
#controlling for clinical factors
fit.stat2 <- glm(sops_worse ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_sops_p, data=end.df, family=binomial(logit))
Anova(fit.stat2)
## Analysis of Deviance Table (Type II tests)
##
## Response: sops_worse
## LR Chisq Df Pr(>Chisq)
## R_Thalamus_mean 0.18528 1 0.66688
## scanner 0.69070 1 0.40592
## Sex_1M 0.13451 1 0.71380
## age_scan 0.02521 1 0.87383
## base_sops_p 2.83737 1 0.09209 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#clinical factors only
fit.stat3 <- glm(sops_worse ~ Sex_1M + age_scan + base_sops_p + base_rx + base_rx, data=end.df, family=binomial(logit))
Anova(fit.stat3)
## Analysis of Deviance Table (Type II tests)
##
## Response: sops_worse
## LR Chisq Df Pr(>Chisq)
## Sex_1M 0.3231 1 0.569751
## age_scan 0.0029 1 0.956901
## base_sops_p 6.7927 1 0.009153 **
## base_rx 3.5063 1 0.061136 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit.stat3)
##
## Call:
## glm(formula = sops_worse ~ Sex_1M + age_scan + base_sops_p +
## base_rx + base_rx, family = binomial(logit), data = end.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8130 -0.6865 0.3842 0.9616 1.5516
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.06918 4.65601 -0.015 0.988
## Sex_1M 0.64420 1.14271 0.564 0.573
## age_scan 0.01214 0.22480 0.054 0.957
## base_sops_p -0.28803 0.13881 -2.075 0.038 *
## base_rx1 3.77544 2.48210 1.521 0.128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 29.065 on 20 degrees of freedom
## Residual deviance: 21.761 on 16 degrees of freedom
## AIC: 31.761
##
## Number of Fisher Scoring iterations: 4
Running partial correlations (as in Egerton et al) on baseline glutamate vs SOPS P (“Relationships between baseline glutamate concentrations and absolute change in attenuated psychotic symptoms or GAF score over time were calculated using partial correlation coefficients, co-varying for baselines cores to control for regression to the mean”)
pcor.df <- end.df %>%
mutate(scan.num = case_when(scanner == "Terra" ~ 0,
scanner == "ONM" ~ 1)) %>% #must be numeric for pcor
dplyr::select("base_sops_p", "sops_p", "sips_diff_days", "R_Thalamus_mean", "scan.num")
pcor(pcor.df, method="pearson")
## $estimate
## base_sops_p sops_p sips_diff_days R_Thalamus_mean
## base_sops_p 1.0000000 0.7087387 -0.6621693 0.2202522
## sops_p 0.7087387 1.0000000 0.3791406 -0.1104574
## sips_diff_days -0.6621693 0.3791406 1.0000000 0.1760602
## R_Thalamus_mean 0.2202522 -0.1104574 0.1760602 1.0000000
## scan.num 0.5858204 -0.3660444 0.7327678 -0.5921479
## scan.num
## base_sops_p 0.5858204
## sops_p -0.3660444
## sips_diff_days 0.7327678
## R_Thalamus_mean -0.5921479
## scan.num 1.0000000
##
## $p.value
## base_sops_p sops_p sips_diff_days R_Thalamus_mean
## base_sops_p 0.0000000000 0.0009926069 0.0027546090 0.379823145
## sops_p 0.0009926069 0.0000000000 0.1207399309 0.662599508
## sips_diff_days 0.0027546090 0.1207399309 0.0000000000 0.484661945
## R_Thalamus_mean 0.3798231454 0.6625995082 0.4846619451 0.000000000
## scan.num 0.0106297365 0.1351960432 0.0005422734 0.009621919
## scan.num
## base_sops_p 0.0106297365
## sops_p 0.1351960432
## sips_diff_days 0.0005422734
## R_Thalamus_mean 0.0096219190
## scan.num 0.0000000000
##
## $statistic
## base_sops_p sops_p sips_diff_days R_Thalamus_mean scan.num
## base_sops_p 0.0000000 4.018527 -3.5346130 0.9031884 2.891366
## sops_p 4.0185273 0.000000 1.6389263 -0.4445500 -1.573374
## sips_diff_days -3.5346130 1.638926 0.0000000 0.7154161 4.307366
## R_Thalamus_mean 0.9031884 -0.444550 0.7154161 0.0000000 -2.939322
## scan.num 2.8913662 -1.573374 4.3073659 -2.9393216 0.000000
##
## $n
## [1] 21
##
## $gp
## [1] 3
##
## $method
## [1] "pearson"
No significant association of thalamic glutamate with final SOPS-P scores is found when controlling for baseline SOPS, time to endpoint, and scanner.
tim=rep(10:19,10)
id=rep(1:10,each=10)
rand.int=rep(rnorm(10,0,1),each=10)
rand.slop=rep(rnorm(10,0,1),each=10)
e=rnorm(100,0,0.5)
y1=10+rand.int+tim+e
y2=10+rand.int+tim+e
y3=10+rand.int+tim+rand.slop*tim+e
reg1=lmer(y1~tim+(tim|id))
summary(reg1)
reg2=lmer(y2~tim+(tim|id))
summary(reg2)
#plot
reg2_coef <- coef(reg2)$id %>%
rename(Intercept = `(Intercept)`, Slope = tim) %>%
rownames_to_column("id")
y2.data <- data.frame(y2, tim, id) %>%
mutate(id=as.character(id))
reg2.data <- left_join(y2.data, reg2_coef, by="id")
ggplot(data = reg2.data, mapping = aes(x = tim, y = y2)) +
geom_point(na.rm = T, aes(col = id)) +
geom_abline(aes(intercept = Intercept,
slope = Slope,
colour = id
),
size = 0.5
)
reg2a=lmer(y2~tim+(1|id))
summary(reg2a)
#plot
reg2a_coef <- coef(reg2a)$id %>%
rename(Intercept = `(Intercept)`, Slope = tim) %>%
rownames_to_column("id")
reg2a.data <- left_join(y2.data, reg2a_coef, by="id")
ggplot(data = reg2a.data, mapping = aes(x = tim, y = y2)) +
geom_point(na.rm = T, aes(col = id)) +
geom_abline(aes(intercept = Intercept,
slope = Slope,
colour = id
),
size = 0.5
)
reg2b=lmer(y2~tim+(-1+tim|id))
summary(reg2b)
#plot
reg2b_coef <- coef(reg2b)$id %>%
rename(Intercept = `(Intercept)`, Slope = tim) %>%
rownames_to_column("id")
reg2b.data <- left_join(y2.data, reg2b_coef, by="id")
ggplot(data = reg2b.data, mapping = aes(x = tim, y = y2)) +
geom_point(na.rm = T, aes(col = id)) +
geom_abline(aes(intercept = Intercept,
slope = Slope,
colour = id
),
size = 0.5
)
reg3=lmer(y3~tim+(tim|id))
summary(reg3)
Using scaled data so model will converge more readily. Building models with predictors chosen based on a priori subject knowledge and relationships shown above, getting sequentially more complex to avoid overfitting. Building models inclusive of baseline scores will assess for relationships between thalamic cest and SOPS-P, allowing that relationship to vary linearly over time (random slope for sips_diff_days). However, because the primary interest of the study is change in SOPS P sxs, we want baseline SOPS to be a predictor of final SOPS (or change in SOPS) not predicted necessarily. Basically, i’m not hypothesizing that the relationship between sops and cest will vary linearly over time by individual (though thats’ maybe interesting?); more directly i want to know if later SOPS are related to CEST.
lmer.1 <- lmer(sops_p ~ sips_diff_days + (1|BBLID), REML = T, data = final.df_scaled)
summary(lmer.1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sops_p ~ sips_diff_days + (1 | BBLID)
## Data: final.df_scaled
##
## REML criterion at convergence: 123.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.1337 -0.4686 -0.1190 0.2692 2.3313
##
## Random effects:
## Groups Name Variance Std.Dev.
## BBLID (Intercept) 0.7948 0.8915
## Residual 0.3271 0.5720
## Number of obs: 48, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.05839 0.21220 19.15213 0.275 0.786
## sips_diff_days 0.15460 0.09762 28.91527 1.584 0.124
##
## Correlation of Fixed Effects:
## (Intr)
## sps_dff_dys 0.022
#plot
model_coefs <- coef(lmer.1)$BBLID %>%
rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>%
rownames_to_column("BBLID")
model1.data <- left_join(final.df_scaled, model_coefs, by="BBLID")
ggplot(data = model1.data, mapping = aes(x = sips_diff_days, y = sops_p)) +
geom_point(na.rm = T, aes(col = BBLID)) +
geom_abline(aes(intercept = Intercept,
slope = Slope,
colour = BBLID
),
size = 0.5
) +
theme(legend.position = "right")
lmer.2 <- lmer(sops_p ~ base_dx + (1 + sips_diff_yrs|BBLID), REML = T, data = final.df_scaled )
summary(lmer.2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sops_p ~ base_dx + (1 + sips_diff_yrs | BBLID)
## Data: final.df_scaled
##
## REML criterion at convergence: 104.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.3310 -0.5417 -0.1086 0.4680 2.1843
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## BBLID (Intercept) 0.28689 0.5356
## sips_diff_yrs 0.05695 0.2386 0.28
## Residual 0.29841 0.5463
## Number of obs: 48, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.674097 0.250865 15.607479 -2.687 0.016456 *
## base_dxother 0.009397 0.700487 14.511855 0.013 0.989479
## base_dxpro 0.599953 0.338429 15.604001 1.773 0.095786 .
## base_dxpsych 1.869096 0.380732 16.312411 4.909 0.000149 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) bs_dxt bs_dxpr
## base_dxothr -0.358
## base_dxpro -0.741 0.265
## bas_dxpsych -0.659 0.236 0.488
#plot
model_coefs2 <- coef(lmer.2)$BBLID %>%
rename(Intercept = `(Intercept)`, Slope = sips_diff_yrs) %>%
rownames_to_column("BBLID")
model2.data <- left_join(final.df_scaled, model_coefs2, by="BBLID")
ggplot(data = model2.data, mapping = aes(x = sips_diff_yrs, y = sops_p)) +
geom_point(na.rm = T, aes(col = BBLID)) +
geom_abline(aes(intercept = Intercept,
slope = Slope,
colour = BBLID
),
size = 0.5
) +
theme(legend.position = "right") +
ggtitle("Model on Scaled Numeric Data")
Simple random slopes model lmer(sops_p ~ (1 + sips_diff_yrs|BBLID), REML = T, data = final.df_scaled) singular with correlation of -1, because ppts with higher intercepts have more negative slopes (i.e. worse baseline sxs -> generally more improvement). Adding base_dx term resolves this singularity (i.e. shakes up this colinearity) by (it seems) accounting for intercept values outside of the random effect of BBLID.
Adding standard demographics (sex and age at scan)
lmer.3 <- lmer(sops_p ~ Sex_1M + age_scan + base_dx + (1 + sips_diff_days|BBLID), REML = T, data = final.df_scaled )
summary(lmer.3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sops_p ~ Sex_1M + age_scan + base_dx + (1 + sips_diff_days |
## BBLID)
## Data: final.df_scaled
##
## REML criterion at convergence: 105.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.39393 -0.54091 -0.05436 0.51024 2.51830
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## BBLID (Intercept) 0.27950 0.5287
## sips_diff_days 0.03395 0.1843 0.80
## Residual 0.31824 0.5641
## Number of obs: 48, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.4764 0.2766 16.6768 -1.722 0.10354
## Sex_1M 0.1708 0.1471 15.2497 1.161 0.26348
## age_scan 0.2612 0.1641 14.2664 1.592 0.13336
## base_dxother -0.7168 0.7698 15.5478 -0.931 0.36605
## base_dxpro 0.2566 0.3731 15.6579 0.688 0.50169
## base_dxpsych 1.5470 0.4155 15.5293 3.723 0.00194 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Sex_1M ag_scn bs_dxt bs_dxpr
## Sex_1M 0.346
## age_scan 0.288 0.044
## base_dxothr -0.486 -0.264 -0.395
## base_dxpro -0.801 -0.312 -0.364 0.432
## bas_dxpsych -0.742 -0.320 -0.363 0.411 0.620
#Anova(lmer.3)
#plot
model_coefs3 <- coef(lmer.3)$BBLID %>%
rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>%
rownames_to_column("BBLID")
model3.data <- left_join(final.df_scaled, model_coefs3, by="BBLID")
ggplot(data = model3.data, mapping = aes(x = sips_diff_days, y = sops_p)) +
geom_point(na.rm = T, aes(col = BBLID)) +
geom_abline(aes(intercept = Intercept,
slope = Slope,
colour = BBLID
),
size = 0.5
) +
theme(legend.position = "right") +
ggtitle("Model on Scaled Numeric Data")
Adding in CEST values along with scanner, since Thalamic glucest contrast varies significantly between scanner.
lmer.4 <- lmer(sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx + (1 + sips_diff_days|BBLID), REML = T, data = final.df_scaled )
summary(lmer.4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx +
## (1 + sips_diff_days | BBLID)
## Data: final.df_scaled
##
## REML criterion at convergence: 106.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.36282 -0.50539 -0.07076 0.45835 2.41745
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## BBLID (Intercept) 0.31860 0.5644
## sips_diff_days 0.03786 0.1946 0.77
## Residual 0.31720 0.5632
## Number of obs: 48, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.5705 0.3112 13.7974 -1.833 0.08843 .
## R_Thalamus_mean -0.1447 0.2238 12.3298 -0.646 0.52998
## scannerTerra 0.3513 0.4600 12.3547 0.764 0.45933
## Sex_1M 0.2033 0.1611 12.7517 1.262 0.22970
## age_scan 0.2906 0.1901 12.6545 1.529 0.15090
## base_dxother -1.0043 0.8835 12.7599 -1.137 0.27653
## base_dxpro 0.1290 0.4249 12.7810 0.304 0.76636
## base_dxpsych 1.5190 0.4357 12.8304 3.487 0.00409 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) R_Thl_ scnnrT Sex_1M ag_scn bs_dxt bs_dxpr
## R_Thalms_mn 0.212
## scannerTerr -0.373 -0.713
## Sex_1M 0.186 -0.147 0.295
## age_scan 0.226 -0.379 0.121 0.035
## base_dxothr -0.260 0.356 -0.411 -0.341 -0.410
## base_dxpro -0.534 0.338 -0.397 -0.380 -0.381 0.528
## bas_dxpsych -0.640 0.093 -0.112 -0.332 -0.345 0.418 0.608
Anova(lmer.4)
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: sops_p
## Chisq Df Pr(>Chisq)
## R_Thalamus_mean 0.4176 1 0.5181
## scanner 0.5834 1 0.4450
## Sex_1M 1.5915 1 0.2071
## age_scan 2.3375 1 0.1263
## base_dx 21.9961 3 6.535e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot
model_coefs4 <- coef(lmer.4)$BBLID %>%
rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>%
rownames_to_column("BBLID")
model4.data <- left_join(final.df_scaled, model_coefs4, by="BBLID")
ggplot(data = model4.data, mapping = aes(x = sips_diff_days, y = sops_p)) +
geom_point(na.rm = T, aes(col = BBLID)) +
geom_abline(aes(intercept = Intercept,
slope = Slope,
colour = BBLID
),
size = 0.5
) +
theme(legend.position = "right") +
ggtitle("Model on Scaled Numeric Data")
Given the small sample size, it’s risky to add more predictors. However, we’ll try just adding the lifetime psych medication indicator.
lmer.5 <- lmer(sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days|BBLID), REML = T, data = final.df_scaled )
summary(lmer.5)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx +
## psych.rx_life + (1 + sips_diff_days | BBLID)
## Data: final.df_scaled
##
## REML criterion at convergence: 101.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.25643 -0.53706 -0.07254 0.55213 2.08859
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## BBLID (Intercept) 0.17781 0.4217
## sips_diff_days 0.04767 0.2183 0.26
## Residual 0.31214 0.5587
## Number of obs: 48, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.49089 0.26400 11.29953 -1.859 0.0892 .
## R_Thalamus_mean -0.07546 0.19730 11.59945 -0.382 0.7090
## scannerTerra 0.15601 0.40819 11.73509 0.382 0.7091
## Sex_1M 0.21152 0.14110 12.14876 1.499 0.1594
## age_scan 0.20002 0.16522 11.18974 1.211 0.2510
## base_dxother -1.25001 0.78355 11.55364 -1.595 0.1376
## base_dxpro 0.23577 0.36901 11.26722 0.639 0.5357
## base_dxpsych 0.58626 0.53476 14.20238 1.096 0.2912
## psych.rx_life1 1.18544 0.50345 21.21613 2.355 0.0282 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) R_Thl_ scnnrT Sex_1M ag_scn bs_dxt bs_dxpr bs_dxps
## R_Thalms_mn 0.212
## scannerTerr -0.364 -0.708
## Sex_1M 0.198 -0.115 0.270
## age_scan 0.237 -0.369 0.117 0.033
## base_dxothr -0.271 0.316 -0.364 -0.340 -0.369
## base_dxpro -0.522 0.340 -0.415 -0.379 -0.371 0.495
## bas_dxpsych -0.493 -0.012 0.043 -0.265 -0.159 0.431 0.360
## psych.rx_l1 0.062 0.107 -0.174 0.038 -0.114 -0.207 0.082 -0.704
Anova(lmer.5)
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: sops_p
## Chisq Df Pr(>Chisq)
## R_Thalamus_mean 0.1463 1 0.70211
## scanner 0.1461 1 0.70230
## Sex_1M 2.2472 1 0.13386
## age_scan 1.4655 1 0.22606
## base_dx 8.1330 3 0.04334 *
## psych.rx_life 5.5444 1 0.01854 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Even though psych.rx_life is very predictive of sops_p at a given timepoint, is not so much a predictor of interest (since if you know whether someone is taking psychiatric meds you probably also know their positive sxs); it’s included because we want to control for it when testing the significance of the other predictors (because meds may affect glucest contrast and/or sops positive score).
Switching out baseline sops positive (base_sops_p) for base_dx results in a singular fit.
lmer.6 <- lmer(sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_sops_p + psych.rx_life + (1 + sips_diff_days|BBLID), REML = T, data = final.df_scaled )
## boundary (singular) fit: see ?isSingular
# summary(lmer.6)
# Anova(lmer.6)
Finally, I’ll fit a full model without CEST data for comparison.
lmer.7 <- lmer(sops_p ~ Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days|BBLID), REML = T, data = final.df_scaled )
summary(lmer.7)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## sops_p ~ Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days |
## BBLID)
## Data: final.df_scaled
##
## REML criterion at convergence: 99.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.24856 -0.56892 -0.06649 0.58064 2.15616
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## BBLID (Intercept) 0.1327 0.3643
## sips_diff_days 0.0442 0.2102 0.23
## Residual 0.3153 0.5616
## Number of obs: 48, groups: BBLID, 21
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.4490 0.2287 13.4203 -1.963 0.0707 .
## Sex_1M 0.1993 0.1263 14.6326 1.578 0.1359
## age_scan 0.1797 0.1391 12.3103 1.292 0.2200
## base_dxother -1.1620 0.6818 14.1460 -1.704 0.1102
## base_dxpro 0.2917 0.3116 12.9301 0.936 0.3663
## base_dxpsych 0.5403 0.5003 15.2687 1.080 0.2969
## psych.rx_life1 1.2589 0.4699 21.3073 2.679 0.0139 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Sex_1M ag_scn bs_dxt bs_dxpr bs_dxps
## Sex_1M 0.342
## age_scan 0.305 0.057
## base_dxothr -0.460 -0.285 -0.344
## base_dxpro -0.794 -0.320 -0.354 0.399
## bas_dxpsych -0.510 -0.295 -0.166 0.486 0.414
## psych.rx_l1 -0.003 0.095 -0.118 -0.306 0.011 -0.710
Anova(lmer.7)
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: sops_p
## Chisq Df Pr(>Chisq)
## Sex_1M 2.4904 1 0.114543
## age_scan 1.6696 1 0.196316
## base_dx 9.1369 3 0.027525 *
## psych.rx_life 7.1766 1 0.007386 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Using anova to compare nested model fits:
anova(lmer.4, lmer.5, test="Chisq")
## refitting model(s) with ML (instead of REML)
## Data: final.df_scaled
## Models:
## lmer.4: sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx + (1 + sips_diff_days | BBLID)
## lmer.5: sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days | BBLID)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## lmer.4 12 121.10 143.55 -48.549 97.098
## lmer.5 13 116.02 140.35 -45.011 90.022 7.0754 1 0.007815 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lmer.5, lmer.7, test="Chisq")
## refitting model(s) with ML (instead of REML)
## Data: final.df_scaled
## Models:
## lmer.7: sops_p ~ Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days | BBLID)
## lmer.5: sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days | BBLID)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## lmer.7 11 112.44 133.02 -45.219 90.437
## lmer.5 13 116.02 140.35 -45.011 90.022 0.4151 2 0.8126
lmer.5 (adding psych.rx_life) unsurprisingly significantly improved the fit significantly. There was no significant difference between lmer.5 and lmer.7, suggesting that adding thalamic CEST values do not improve the model.
“The residual variance estimate can be thought of as the within groups variance and each random effect variance estimate can be thought of as a between groups estimate”
Diagnostics
#linearity - plotting residuals
plot(lmer.5)
#normality - plotting quantiles
qqnorm(residuals(lmer.5))
Heteroscedastic - model fits less well at higher (and more important) sops_p scores
Removed because it resulted in irritating and uninterpretable inf values.
sops_change <- sops_change %>%
group_by(bblid_scan) %>%
arrange(scan.time) %>%
mutate(pct.delta.sops_p = abs.delta.sops_p/first(sops_p) * 100,
pct.delta.sops_n = abs.delta.sops_n/first(sops_n) * 100,
pct.delta.sops_d = abs.delta.sops_d/first(sops_d) * 100,
pct.delta.sops_g = abs.delta.sops_g/first(sops_g) * 100,
pct.delta.sops_tot = abs.delta.sops_tot/first(sops_tot) * 100,
pct.delta.gacf = abs.delta.gafc/first(GAF_C) * 100)
Merge and keep med info at time of scan as well as time of each SOPS endpoint assessment.
sops_change_meds <- left_join(sops_change_dx, medswide, by="BBLID")
#calculate datediff from meds to scan/SIPS
sops_change_meds <- sops_change_meds %>%
mutate(meds_7t_diff = as.numeric(difftime(DOCOLLECT, date7t, units = "days")),
meds_sops_diff = as.numeric(difftime(DOCOLLECT, DOSIPS, units = "days")))
#ok, now pull the baseline meds and set them aside for now
meds_baseline <- sops_change_meds %>%
arrange(abs(meds_7t_diff)) %>%
group_by(observation) %>%
slice_head() %>% #keep only closest dx
ungroup() %>%
dplyr::select(observation, meds_7t_diff, DOCOLLECT | contains("MEDICINE_") | contains("NOTES_") | contains("IS_CURRENT_")) %>% #drop non-med stuff
dplyr::select_if(~!all(is.na(.))) %>% #drop any col that's totally empty
rename_with(.cols = 3:ncol(.), function(x){paste0("base.", x)}) #add prefix
#go back and filter down to endpoint meds
meds_end <- sops_change_meds %>%
arrange(abs(meds_sops_diff)) %>%
group_by(observation) %>%
slice_head() %>% #keep only closest dx
ungroup() %>%
dplyr::select(observation, meds_sops_diff, DOCOLLECT | contains("MEDICINE_") | contains("NOTES_") | contains("IS_CURRENT_")) %>% #drop non-med stuff
dplyr::select_if(~!all(is.na(.)))
#plots to see the range of closest med info
sops.med.p <- meds_end %>%
ggplot() +
geom_histogram(aes(x = meds_sops_diff), position="identity", color="lightblue", fill="lightblue")
scan.med.p <- meds_baseline %>%
ggplot() +
geom_histogram(aes(x = meds_7t_diff), position="identity", color="lightgreen", fill="lightgreen")
# put p1 and p2 together
ggarrange(sops.med.p, scan.med.p, common.legend = T, legend = "bottom", ncol = 2)
Ok, maybe let’s restrict to +- 100 days and add back in. Also tried 200 days but didn’t add anything
#filter down
meds_baseline <- meds_baseline %>%
filter(abs(meds_7t_diff) <= 100) %>%
mutate(across(c(contains("MEDICINE_")), factor)) #also setting meds as factors to see frequency later
meds_end <- meds_end %>%
filter(abs(meds_sops_diff) <= 100) %>%
mutate(across(c(contains("MEDICINE_")), factor)) #also setting meds as factors to see frequency later
#merge base and end meds together
meds_filt <- full_join(meds_baseline, meds_end, by="observation")
sum(is.na(meds_filt$meds_7t_diff))
sum(is.na(meds_filt$meds_sops_diff))
#merging meds with sops_change_dx b/c sops_change_meds already has med stuff
sops_change_meds.final <- left_join(sops_change_dx, meds_filt, by="observation")
#make sure no obs lost
n_unique(sops_change_meds.final$observation)
#skim(sops_change_meds.final)
Ok, now how to summarize all this medication info into one cohesive value
#see how often each med is reported
sops_change_meds.final %>%
dplyr::select(contains("MEDICINE")) %>%
unlist() %>%
table()
NOTES: CARVEDILOL = beta blocker (anxiety?) CLONIDINE = hypertension, ADHD? DEXMETHYLPHENIDATE HYDROCHLORIDE = ADHD NORETHINDRONE ACETATE = progestin RANITIDINE = antihistamine/antacid TRILOMARZIA = birth control ALBUTEROL = asthma DEXEDRINE = ADHD DICLOFENAC = arthritis ADVAIR = asthma
ideas - use 0,1 indicator for each class of medication at each timepoint? could be granular (asthma, allergies, stimulants) or more broad (psych_rx, other_rx) - use that indicator but lifetime?
keep if start/end dates of med are < date > ? lifetime indicator - dont filter by date assessed, just if date-rx-start is < endpoint date?
Test using the DOMED_START and DOMED_END to only capture meds that are explicitly current for 7T scan
#merge
scan_date_med <- inner_join(scan_dates, meds_reduced, by="BBLID")
#ok, keep only rows where med is current to 7T
scan_date_med_current <- scan_date_med %>%
filter(is.na(DOMED_START) | date7t > DOMED_START) %>% #keep if med is STARTED BEFORE 7t
filter(is.na(DOMED_START) | date7t < DOMED_END) %>% #keep if med was ENDED AFTER 7t
mutate(med_7t_diff = as.numeric(difftime(DOCOLLECT,date7t, units = "days"))) %>%
filter(!(is.na(DOMED_START) & (abs(med_7t_diff) > 100))) %>% #only keep NAs if DOCOLLECT is w/in 100 days of scan
group_by(observation) %>%
mutate(count = row_number()) %>% #add counts so i can pivot after
ungroup() %>%
rename_with(.cols = 4:ncol(.), function(x){paste0("base.", x)}) #add prefix
scan_date_med_current$base.NOTES[is.na(scan_date_med_current$base.NOTES)] <- "empty entry"
skim(scan_date_med_current)
#pivot wide
scan_meds_current_wide <- pivot_wider(scan_date_med_current, id_cols=observation, names_from = base.count, values_from=c(4:12))
#just keep essential cols
scan.meds.current_wide.filt <- scan_meds_current_wide %>%
dplyr::select(observation | contains("MEDICINE_") | contains("NOTES_")) %>%
dplyr::select_if(~!all(is.na(.))) #drop any col that's totally empty
In the meantime I’ll try to find meds current to endpoint SOPs.
#ok, keep only rows where med is current to 7T
end_date_med_current <- end_date_med %>%
filter(is.na(DOMED_START) | DOSIPS > DOMED_START) %>% #keep if med is STARTED BEFORE SOPS end
filter(is.na(DOMED_START) | DOSIPS < DOMED_END) %>% #keep if med was ENDED AFTER SOPS end
mutate(med_sops_diff = as.numeric(difftime(DOCOLLECT, DOSIPS, units = "days"))) %>%
filter(!(is.na(DOMED_START) & (abs(med_sops_diff) > 100))) %>% #only keep NAs if DOCOLLECT is w/in 100 days of scan
group_by(observation) %>%
mutate(count = row_number()) %>% #add counts so i can pivot after
ungroup()
end_date_med_current$NOTES[is.na(end_date_med_current$NOTES)] <- "empty entry"
skim(end_date_med_current)
#pivot wide
end_meds_current_wide <- pivot_wider(end_date_med_current, id_cols=observation, names_from = count, values_from=c(4:12))
#just keep essential cols
end.meds.current_wide.filt <- end_meds_current_wide %>%
dplyr::select(observation | contains("MEDICINE_") | contains("NOTES_")) %>%
dplyr::select_if(~!all(is.na(.))) #drop any col that's totally empty
This seems to be working, lets try merging baseline and endpoint meds and see what we have
currentmeds <- right_join(scan.meds.current_wide.filt, end.meds.current_wide.filt, by="observation")
Actually very narrow range of meds here, but the empty entry notes indicate either no medications or no new medications. Therefore I’ll need to look back at participant’s former meds
#Wide Data
Ok, now we calculate change from baseline to each endpoint (absolute and slope). Change is defined as final score - baseline score, so a negative value means final<baseline which means sxs improved! (except GAFC, where positive -> sxs improved)
#calculate change
sops_change <- scanlist_clean %>%
group_by(bblid_scan) %>%
arrange(scan.time) %>%
mutate(abs.delta.sops_p = sops_p - first(sops_p), #change=final-baseline -> negative value means score improves!
abs.delta.sops_n = sops_n - first(sops_n),
abs.delta.sops_d = sops_d - first(sops_d),
abs.delta.sops_g = sops_g - first(sops_g),
abs.delta.sops_tot = sops_tot - first(sops_tot),
abs.delta.gafc = GAF_C - first(GAF_C),
delta.sops_time = as.numeric(difftime(DOSIPS,first(DOSIPS), units = "days")),
slope.sops_p = abs.delta.sops_p/delta.sops_time,
slope.sops_n = abs.delta.sops_n/delta.sops_time,
slope.sops_d = abs.delta.sops_d/delta.sops_time,
slope.sops_g = abs.delta.sops_g/delta.sops_time,
slope.sops_tot = abs.delta.sops_tot/delta.sops_time,
slope.gafc = abs.delta.gafc/delta.sops_time,
observation = paste(bblid_scan, scan.time)) %>% #add unique ID for each BBLID-scan-endpoint#
ungroup() %>%
dplyr::select(!c(sops.item.list, sops_p, sops_n, sops_d, sops_g, sops_tot)) %>% #drop raw scores
filter(!(scan.time=="base.1"))
#double check to make sure no endpoint scans come before baseline scans
summary(sops_change$delta.sops_time)
#just one row per BBLID-scan-endpoint#
any(duplicated(sops_change$observation))
n_unique(sops_change$observation)
#fun plots
ggplot(sops_change, aes(x=slope.sops_tot)) + geom_histogram()
ggplot(sops_change, aes(x=slope.sops_n)) + geom_histogram()
ggplot(sops_change, aes(x=slope.gafc)) + geom_histogram()
scanlist_clean %>%
ggplot(aes(x = sips_diff_days, y = sops_tot, color = bblid_scan)) +
geom_line() +
theme_bw()
linear model for fun
bs.fit <- lm(slope.sops_tot ~ PRODROMAL, data=sops_change)
Anova(bs.fit)